def test_adwin(test_path):
    """Check that ADWIN flags drift at the expected positions of a fixture.

    The stream is loaded from ``drift_stream.npy`` inside ``test_path``.
    The fixture is described as holding integers in {0, 1} for its first
    half and integers from 0 to 7 from index 999 to 1999 (the file itself
    is not inspected here, only the detection indices are asserted).

    Parameters
    ----------
    test_path: str
        Directory containing the ``drift_stream.npy`` fixture.
    """
    detector = ADWIN()
    stream = np.load(os.path.join(test_path, 'drift_stream.npy'))
    expected_indices = [1023, 1055, 1087, 1151]

    # Feed the stream one value at a time and remember every index at
    # which the detector reports a change.
    detected_indices = []
    for position in range(stream.size):
        detector.add_element(stream[position])
        if detector.detected_change():
            detected_indices.append(position)

    assert detected_indices == expected_indices
def demo():
    """ demo

    Feeds a simulated stream into an ADWIN object and prints every index
    at which a change is detected.

    The stream has 2000 values. It starts as randomly generated 0's and
    1's; the values from index 999 to 1999 are then replaced by random
    integers from 0 to 7, which changes the distribution of the data.

    """
    adwin = ADWIN()
    size = 2000
    drift_start = 999

    data_stream = np.random.randint(2, size=size)
    # Overwrite the second half one value at a time to introduce the drift.
    for position in range(drift_start, size):
        data_stream[position] = np.random.randint(8)

    for position in range(size):
        value = data_stream[position]
        adwin.add_element(value)
        if not adwin.detected_change():
            continue
        print('Change has been detected in data: ' + str(value) +
              ' - of index: ' + str(position))
class AdaLearningNode(LearningNodeNBAdaptive, NewNode):
    """ Learning (leaf) node that tracks its own error with ADWIN.

    Each training instance is first predicted by the node; whether that
    prediction was wrong (1.0) or right (0.0) is fed to an ADWIN detector
    kept in ``estimationErrorWeight``. ``ErrorChange`` records whether
    ADWIN reported a change that corresponds to an increase of the error.

    Parameters
    ----------
    initial_class_observations:
        Forwarded unchanged to ``LearningNodeNBAdaptive.__init__``.
    """

    def __init__(self, initial_class_observations):
        LearningNodeNBAdaptive.__init__(self, initial_class_observations)
        self.estimationErrorWeight = ADWIN()
        self.ErrorChange = False
        self.randomSeed = 1
        # random.seed() returns None, so storing its result left this
        # attribute empty (and re-seeded the global ``random`` module on
        # every node creation). Keep a private, seeded generator instead.
        self.classifierRandom = np.random.RandomState(self.randomSeed)

    def calc_byte_size(self):
        """ Return ``__sizeof__()`` plus the ADWIN length estimation. """
        byte_size = self.__sizeof__()
        if self.estimationErrorWeight is not None:
            byte_size += self.estimationErrorWeight.get_length_estimation()
        return byte_size

    # Override NewNode
    def number_leaves(self):
        """ A learning node is a single leaf. """
        return 1

    # Override NewNode
    def get_error_estimation(self):
        """ Return the current ``_estimation`` of the ADWIN detector. """
        return self.estimationErrorWeight._estimation

    # Override NewNode
    def get_error_width(self):
        """ Return the current ``_width`` of the ADWIN detector. """
        return self.estimationErrorWeight._width

    # Override NewNode
    def is_null_error(self):
        """ Return True when no ADWIN detector is attached. """
        return self.estimationErrorWeight is None

    def kill_tree_childs(self, hat):
        """ Nothing to do: a learning node has no children. """
        pass

    # Override NewNode
    def learn_from_instance(self, X, y, weight, hat, parent, parent_branch):
        """ Update the error monitor and the node statistics.

        Parameters
        ----------
        X:
            The instance's features.
        y:
            The true class of the instance.
        weight:
            Instance weight; multiplied by a Poisson(1) draw when that
            draw is positive.
        hat:
            The owning tree; provides ``leaf_prediction``,
            ``grace_period`` and ``_attempt_to_split``.
        parent:
            Parent node of this leaf, forwarded to ``_attempt_to_split``.
        parent_branch:
            Branch index of this leaf in ``parent``.
        """
        true_class = y

        # Draw from the node's own generator (previously ``None`` was
        # silently passed as the ``size`` argument of np.random.poisson).
        k = self.classifierRandom.poisson(1.0)
        if k > 0:
            weight = weight * k

        votes = self.get_class_votes(X, hat)
        class_prediction = get_max_value_index(votes)
        bl_correct = (true_class == class_prediction)

        if self.estimationErrorWeight is None:
            self.estimationErrorWeight = ADWIN()

        old_error = self.get_error_estimation()

        # Add element to Adwin. Truthiness is used on purpose: comparing
        # NumPy scalars yields numpy.bool_, for which ``is True`` is
        # always False and every sample would be counted as an error.
        add = 0.0 if bl_correct else 1.0
        self.estimationErrorWeight.add_element(add)

        # Detect change with Adwin; a change towards a lower error is
        # not treated as a change.
        self.ErrorChange = self.estimationErrorWeight.detected_change()
        if self.ErrorChange and old_error > self.get_error_estimation():
            self.ErrorChange = False

        # Update statistics, call LearningNodeNBAdaptive
        super().learn_from_instance(X, y, weight, hat)

        # Call ActiveLearningNode: try to split once enough weight has
        # been seen since the last evaluation.
        weight_seen = self.get_weight_seen()
        weight_diff = (weight_seen
                       - self.get_weight_seen_at_last_split_evaluation())
        if weight_diff >= hat.grace_period:
            hat._attempt_to_split(self, parent, parent_branch)
            self.set_weight_seen_at_last_split_evaluation(weight_seen)

    # Override LearningNodeNBAdaptive
    def get_class_votes(self, X, ht):
        """ Return the class votes for ``X`` under ``ht.leaf_prediction``.

        The votes are passed to ``normalize_values_in_dict`` together
        with ``sum(votes) * error ** 2`` when that factor is positive.
        """
        prediction_option = ht.leaf_prediction

        if prediction_option == MAJORITY_CLASS:  # MC
            dist = self.get_observed_class_distribution()
        elif prediction_option == NAIVE_BAYES:  # NB
            dist = do_naive_bayes_prediction(
                X, self._observed_class_distribution,
                self._attribute_observers)
        else:
            # NBAdaptive. This used to run unconditionally and overwrote
            # the result of the two explicit options above.
            if self._mc_correct_weight > self._nb_correct_weight:
                dist = self.get_observed_class_distribution()
            else:
                dist = do_naive_bayes_prediction(
                    X, self._observed_class_distribution,
                    self._attribute_observers)

        dist_sum = sum(dist.values())  # sum all values in dictionary
        error = self.get_error_estimation()
        normalization_factor = dist_sum * error * error
        if normalization_factor > 0.0:
            normalize_values_in_dict(normalization_factor, dist)

        return dist

    # Override NewNode, New for option votes
    def filter_instance_to_leaves(self, X, split_parent, parent_branch,
                                  update_splitter_counts, found_nodes=None):
        """ Append this leaf, as a ``FoundNode``, to ``found_nodes``.

        Returns
        -------
        list
            ``found_nodes`` (a new list when none was supplied), so that
            the result is not lost when the argument is omitted.
        """
        if found_nodes is None:
            found_nodes = []
        found_nodes.append(
            HoeffdingTree.FoundNode(self, split_parent, parent_branch))
        return found_nodes
class AdaSplitNode(SplitNode, NewNode):
    """ Split node that monitors its error with ADWIN and may grow an
    alternate subtree.

    When ADWIN reports an increase of the error at this node, a new
    learning node is started as ``_alternate_tree``. Once both trees have
    seen enough data (ADWIN width above ``error_width_threshold``), the
    alternate tree either replaces this node or is discarded, depending
    on a bound computed from the two error rates.

    Parameters
    ----------
    split_test, class_observations, size:
        Forwarded unchanged to ``SplitNode.__init__``.
    """

    def __init__(self, split_test, class_observations, size):
        SplitNode.__init__(self, split_test, class_observations, size)
        self._estimation_error_weight = ADWIN()
        # Starts empty; it is created through hat._new_learning_node()
        # when a change is detected in learn_from_instance.
        self._alternate_tree = None
        self.error_change = False
        self._random_seed = 1
        # random.seed() returns None (and re-seeds the global generator),
        # so keep a private seeded generator instead. It is not used by
        # any method visible in this class.
        self._classifier_random = random.Random(self._random_seed)

    # Override SplitNode
    def calc_byte_size_including_subtree(self):
        """ Return the size of this node, its alternate tree, its ADWIN
        length estimation and all of its children. """
        byte_size = self.__sizeof__()
        if self._alternate_tree is not None:
            byte_size += \
                self._alternate_tree.calc_byte_size_including_subtree()
        if self._estimation_error_weight is not None:
            byte_size += \
                self._estimation_error_weight.get_length_estimation()
        for child in self._children:
            if child is not None:
                byte_size += child.calc_byte_size_including_subtree()
        return byte_size

    # Override NewNode
    def number_leaves(self):
        """ Return the number of leaves below this node. """
        return sum(child.number_leaves()
                   for child in self._children if child is not None)

    # Override NewNode
    def get_error_estimation(self):
        """ Return the current ``_estimation`` of the ADWIN detector. """
        return self._estimation_error_weight._estimation

    # Override NewNode
    def get_error_width(self):
        """ Return the ADWIN ``_width``, or 0.0 without a detector. """
        if self.is_null_error():
            return 0.0
        return self._estimation_error_weight._width

    # Override NewNode
    def is_null_error(self):
        """ Return True when no ADWIN detector is attached. """
        return self._estimation_error_weight is None

    # Override NewNode
    def learn_from_instance(self, X, y, weight, hat, parent, parent_branch):
        """ Update the error monitor, manage the alternate tree and pass
        the instance on to the alternate tree and the matching child.

        Parameters
        ----------
        X:
            The instance's features.
        y:
            The true class of the instance.
        weight:
            Instance weight, forwarded to the subtrees.
        hat:
            The owning tree; its counters and ``_tree_root`` are updated.
        parent:
            Parent of this node, or None when this node has no parent.
        parent_branch:
            Branch index of this node in ``parent``.
        """
        true_class = y
        class_prediction = 0

        # Look the leaf up once instead of once for the test and once
        # more for the prediction.
        leaf = self.filter_instance_to_leaf(X, parent, parent_branch).node
        if leaf is not None:
            class_prediction = get_max_value_index(
                leaf.get_class_votes(X, hat))

        bl_correct = (true_class == class_prediction)

        if self._estimation_error_weight is None:
            self._estimation_error_weight = ADWIN()

        old_error = self.get_error_estimation()

        # Add element to Adwin. Truthiness is used on purpose: comparing
        # NumPy scalars yields numpy.bool_, for which ``is True`` is
        # always False and every sample would be counted as an error.
        add = 0.0 if bl_correct else 1.0
        self._estimation_error_weight.add_element(add)

        # Detect change with Adwin; a change towards a lower error is
        # not treated as a change.
        self.error_change = self._estimation_error_weight.detected_change()
        if self.error_change and old_error > self.get_error_estimation():
            self.error_change = False

        if self.error_change:
            # Condition to build a new alternate tree
            self._alternate_tree = hat._new_learning_node()
            # NOTE(review): the counter names on ``hat`` are kept as
            # found; confirm them against the tree class.
            hat._alternateTrees += 1
        elif (self._alternate_tree is not None
              and not self._alternate_tree.is_null_error()):
            # Condition to replace the alternate tree: both detectors
            # must have seen enough data.
            if (self.get_error_width() > error_width_threshold
                    and self._alternate_tree.get_error_width()
                    > error_width_threshold):
                old_error_rate = self.get_error_estimation()
                alt_error_rate = self._alternate_tree.get_error_estimation()
                fDelta = .05
                fN = (1.0 / self._alternate_tree.get_error_width()
                      + 1.0 / self.get_error_width())
                bound = math.sqrt(2.0 * old_error_rate
                                  * (1.0 - old_error_rate)
                                  * math.log(2.0 / fDelta) * fN)

                if bound < (old_error_rate - alt_error_rate):
                    # The alternate tree is better: switch to it.
                    hat._active_leaf_node_cnt -= self.number_leaves()
                    hat._active_leaf_node_cnt += \
                        self._alternate_tree.number_leaves()
                    self.kill_tree_childs(hat)
                    if parent is not None:
                        parent.set_child(parent_branch, self._alternate_tree)
                    else:
                        # The attribute is ``_alternate_tree``; the former
                        # ``alternateTree`` does not exist on this class.
                        hat._tree_root = hat._tree_root._alternate_tree
                    hat._switchAlternateTrees += 1
                elif bound < (alt_error_rate - old_error_rate):
                    # The alternate tree is worse: prune it. The second
                    # test used to repeat ActiveLearningNode and was
                    # therefore unreachable.
                    if isinstance(self._alternate_tree,
                                  HAT.ActiveLearningNode):
                        self._alternate_tree = None
                    elif isinstance(self._alternate_tree,
                                    HAT.InactiveLearningNode):
                        self._alternate_tree = None
                    else:
                        self._alternate_tree.kill_tree_childs(hat)
                    hat._prunedalternateTree += 1

        # Learn_From_Instance alternate Tree and Child nodes
        if self._alternate_tree is not None:
            self._alternate_tree.learn_from_instance(
                X, y, weight, hat, parent, parent_branch)
        child_branch = self.instance_child_index(X)
        child = self.get_child(child_branch)
        if child is not None:
            # The child's parent is this node (not this node's parent),
            # otherwise a later split is attached to the wrong node.
            child.learn_from_instance(X, y, weight, hat, self, child_branch)

    # Override NewNode
    def kill_tree_childs(self, hat):
        """ Recursively account for the removal of this node's subtree.

        Only the counters on ``hat`` are updated; the children are not
        detached from ``self._children``.
        """
        for child in self._children:
            if child is None:
                continue

            if isinstance(child, HAT.AdaSplitNode):
                # Count the alternate tree if it exists. The counter
                # lives on the tree, as in learn_from_instance; this node
                # never initialises a ``_pruned_alternate_trees``.
                if child._alternate_tree is not None:
                    hat._prunedalternateTree += 1
                # Recursive delete of SplitNodes
                child.kill_tree_childs(hat)

            if isinstance(child, HAT.ActiveLearningNode):
                hat._active_leaf_node_cnt -= 1
            elif isinstance(child, HAT.InactiveLearningNode):
                hat._inactive_leaf_node_cnt -= 1

    # Override NewNode
    def filter_instance_to_leaves(self, X, parent, parent_branch,
                                  update_splitter_counts, found_nodes=None):
        """ Collect every leaf reached by ``X``, including those of the
        alternate tree.

        Returns
        -------
        list
            ``found_nodes`` (a new list when none was supplied), so that
            the result is not lost when the argument is omitted.
        """
        if found_nodes is None:
            found_nodes = []

        child_index = self.instance_child_index(X)
        if child_index >= 0:
            child = self.get_child(child_index)
            if child is not None:
                # Report this node and the branch taken as the child's
                # parent, consistently with the empty-branch case below.
                child.filter_instance_to_leaves(
                    X, self, child_index, update_splitter_counts,
                    found_nodes)
            else:
                found_nodes.append(
                    HoeffdingTree.FoundNode(None, self, child_index))

        if self._alternate_tree is not None:
            # -999 marks nodes reached through the alternate tree.
            self._alternate_tree.filter_instance_to_leaves(
                X, self, -999, update_splitter_counts, found_nodes)

        return found_nodes
class KNNAdwin(KNN):
    """ K-Nearest Neighbors Classifier with ADWIN Change detector

    This Classifier is an improvement from the regular KNN classifier,
    as it is resistant to concept drift. It utilises the ADWIN change
    detector to decide which samples to keep and which ones to forget,
    and by doing so it regulates the sample window size.

    To know more about the ADWIN change detector, please visit
    skmultiflow.classification.core.driftdetection.adwin

    It uses the regular KNN Classifier as a base class, with the
    major difference that this class keeps a variable size window,
    instead of a fixed size one and also it updates the adwin algorithm
    at each partial_fit call.

    Parameters
    ----------
    k: int
        The number of nearest neighbors to search for.

    max_window_size: int
        The maximum size of the window storing the last viewed samples.

    leaf_size: int
        The maximum number of samples that can be stored in one leaf node,
        which determines from which point the algorithm will switch for a
        brute-force approach. The bigger this number the faster the tree
        construction time, but the slower the query time will be.

    categorical_list: An array-like, optional
        Each entry is the index of a categorical feature. Defaults to an
        empty list, created anew for every instance.

    Raises
    ------
    NotImplementedError: A few of the functions described here are not
    implemented since they have no application in this context.

    ValueError: A ValueError is raised if the predict function is called
    before at least k samples have been analyzed by the algorithm.

    Examples
    --------
    >>> # Imports
    >>> from skmultiflow.classification.lazy.knn_adwin import KNNAdwin
    >>> from skmultiflow.classification.lazy.knn import KNN
    >>> from skmultiflow.data.file_stream import FileStream
    >>> from skmultiflow.options.file_option import FileOption
    >>> # Setting up the stream
    >>> opt = FileOption('FILE', 'OPT_NAME', 'skmultiflow/datasets/covtype.csv', 'csv', False)
    >>> stream = FileStream(opt, -1, 1)
    >>> stream.prepare_for_use()
    >>> # Setting up the KNNAdwin classifier
    >>> knn_adwin = KNNAdwin(k=8, leaf_size=40, max_window_size=2000)
    >>> # Pre training the classifier with 200 samples
    >>> X, y = stream.next_instance(200)
    >>> knn_adwin = knn_adwin.partial_fit(X, y)
    >>> # Keeping track of sample count and correct prediction count
    >>> n_samples = 0
    >>> corrects = 0
    >>> while n_samples < 5000:
    ...     X, y = stream.next_instance()
    ...     pred = knn_adwin.predict(X)
    ...     if y[0] == pred[0]:
    ...         corrects += 1
    ...     knn_adwin = knn_adwin.partial_fit(X, y)
    ...     n_samples += 1
    >>>
    >>> # Displaying the results
    >>> print('KNN usage example')
    >>> print(str(n_samples) + ' samples analyzed.')
    5000 samples analyzed.
    >>> print("KNNAdwin's performance: " + str(corrects/n_samples))
    KNNAdwin's performance: 0.7798

    """

    def __init__(self, k=5, max_window_size=sys.maxsize, leaf_size=30,
                 categorical_list=None):
        # A literal ``[]`` default would be shared by every instance
        # created without the argument; build a fresh list instead.
        if categorical_list is None:
            categorical_list = []
        super().__init__(k=k, max_window_size=max_window_size,
                         leaf_size=leaf_size,
                         categorical_list=categorical_list)
        self.adwin = ADWIN()
        self.window = None

    def reset(self):
        """ reset

        Resets the adwin algorithm as well as the base model
        kept by the KNN base class.

        Returns
        -------
        KNNAdwin
            self

        """
        self.adwin = ADWIN()
        return super().reset()

    def partial_fit(self, X, y, classes=None, weight=None):
        """ partial_fit

        Partially fits the model. This is done by updating the window
        with new samples while also updating the adwin algorithm. Then
        we verify if a change was detected, and if so, the oldest samples
        are dropped until the window is as wide as ADWIN's.

        Parameters
        ----------
        X: Numpy.ndarray of shape (n_samples, n_features)
            The data upon which the algorithm will create its model.

        y: Array-like
            An array-like containing the classification targets for all
            samples in X.

        classes: Not used.

        weight: Not used.

        Returns
        -------
        KNNAdwin
            self

        """
        r, c = get_dimensions(X)
        if self.window is None:
            self.window = InstanceWindow(max_size=self.max_window_size)

        for i in range(r):
            # Both branches of the former ``if r > 1`` were identical.
            self.window.add_element(np.asarray([X[i]]),
                                    np.asarray([[y[i]]]))
            if self.window._num_samples >= self.k:
                # ADWIN is fed 1 for a correct prediction and 0 otherwise.
                add = 1 if self.predict(np.asarray([X[i]])) == y[i] else 0
                self.adwin.add_element(add)
            else:
                # Not enough samples to predict yet.
                self.adwin.add_element(0)

        if self.window._num_samples >= self.k:
            changed = self.adwin.detected_change()
            if changed and self.adwin._width < self.window._num_samples:
                # Delete one sample per unit of difference between the
                # two widths; the range is computed once, up front.
                for _ in range(self.window._num_samples,
                               self.adwin._width, -1):
                    self.window.delete_element()

        return self
# Imports
import numpy as np
from skmultiflow.classification.core.driftdetection.adwin import ADWIN

adwin = ADWIN()

# Simulating a data stream of randomly generated 1's and 0's
data_stream = np.random.randint(2, size=2000)

# Changing the data concept from index 999 onwards: the second half of
# the stream holds random integers from 4 to 7
for i in range(999, data_stream.size):
    data_stream[i] = np.random.randint(4, high=8)

# Adding stream elements to ADWIN one by one and reporting every index
# at which a drift is detected
for i in range(data_stream.size):
    adwin.add_element(data_stream[i])
    if not adwin.detected_change():
        continue
    print('Change has been detected in data: ' + str(data_stream[i]) +
          ' - of index: ' + str(i))