class AdaptiveTree(object):
    """Per-tree bookkeeping: detectors, recent predictions and a kappa score.

    Parameters
    ----------
    tree
        The wrapped tree object; stored as-is in ``self.tree``.
    kappa_window : int
        Maximum number of predicted labels kept, and the number required
        before a kappa score is computed.
    warning_delta
        Argument forwarded to ``ADWIN`` for the warning detector.
    drift_delta
        Argument forwarded to ``ADWIN`` for the drift detector.
    tree_pool_id : int, optional (default=-1)
        Identifier stored in ``self.tree_pool_id``.
        NOTE(review): presumably an index into a pool of trees -- confirm
        against the caller.
    """

    def __init__(self,
                 tree,
                 kappa_window,
                 warning_delta,
                 drift_delta,
                 tree_pool_id=-1):
        self.tree_pool_id = tree_pool_id
        self.tree = tree

        # No background tree yet, and not a candidate.
        self.bg_adaptive_tree = None
        self.is_candidate = False

        # Two independent detectors, one per delta.
        self.warning_detector = ADWIN(warning_delta)
        self.drift_detector = ADWIN(drift_delta)

        # Bounded history of predictions; -sys.maxsize marks "no kappa yet".
        self.predicted_labels = deque(maxlen=kappa_window)
        self.kappa = -sys.maxsize
        self.kappa_window = kappa_window

    def update_kappa(self, actual_labels):
        """Recompute, store and return ``self.kappa``.

        While fewer than ``kappa_window`` predictions are stored the score is
        the sentinel ``-sys.maxsize``; otherwise it is
        ``cohen_kappa_score(actual_labels, self.predicted_labels)``.
        """
        window_is_full = len(self.predicted_labels) >= self.kappa_window
        self.kappa = (
            cohen_kappa_score(actual_labels, self.predicted_labels)
            if window_is_full
            else -sys.maxsize
        )
        return self.kappa

    def reset(self):
        """Put everything except the tree and its pool id back to the start state."""
        self.kappa = -sys.maxsize
        self.predicted_labels.clear()
        self.warning_detector.reset()
        self.drift_detector.reset()
        self.is_candidate = False
        self.bg_adaptive_tree = None
class SADWINIsolationForestStream(BaseSKMObject, ClassifierMixin):
    """Isolation-forest anomaly detector for data streams with ADWIN drift checks.

    This class follows the sliding-window anomaly detection approach based on
    Isolation Forest for streaming data (Ding & Fei, 2013) [3]. Each sample
    gets an anomaly score computed with the Isolation Forest approach [2].
    An isolation forest [1] isolates observations by randomly selecting a
    feature and then randomly selecting a split value between the minimum and
    maximum values of that feature.

    The stream is cut into consecutive windows of ``window_size`` samples.
    The forest is fitted on the first complete window. When
    ``version == "SADWIN"``, the anomaly scores of every later complete window
    are fed one by one into an ADWIN detector; as soon as ADWIN reports a
    change, the current forest is discarded and a new one is fitted on that
    window. For any other ``version`` value this class never refits the
    forest after the first window.

    Parameters
    ----------
    window_size : int, optional (default=100)
        Number of samples per sliding window ('Psi' in the original paper).
    n_estimators : int, optional (default=25)
        Number of trees in the ensemble ('t' in the original paper).
    anomaly_threshold : float, optional (default=0.5)
        Scores above this threshold are treated as anomalies.
    drift_threshold : float, optional (default=0.5)
        Stored on the instance. It is not read by any method of this class
        (drift is decided by ADWIN here).
    random_state : optional (default=None)
        Forwarded to ``IsolationTreeEnsemble``.
    version : str, optional (default="AnomalyRate")
        Variant name. Only ``"SADWIN"`` enables the ADWIN-driven model update.
    n_estimators_updated : float, optional (default=0.5)
        Fraction of trees, in ``(0, 1]``. Stored as
        ``int(n_estimators * n_estimators_updated)``.
    updated_randomly : bool, optional (default=True)
        Stored on the instance; not read by any method of this class.
    alpha, n_dimensions, n_tested_samples, fixed_checked_dimension,
    fixed_checked_sample
        Stored on the instance; not read by any method of this class
        (labelled "Parameters for NDKSWIN" by the original author).
    data : optional (default=None)
        Accepted for signature compatibility; not stored and not used.

    Attributes
    ----------
    ensemble : IsolationTreeEnsemble or None
        The current model; created on the first ``partial_fit`` call.
    samples_seen : int
        Number of samples passed through ``_partial_fit`` so far.
    anomaly_rate : float
        Initialised to 0.20; not updated by any method of this class.
    prec_window, window : numpy.ndarray or None
        The previous and the current window of data.
    cpt : int
        Counter initialised to 0; not updated by any method of this class.
    model_update : list
        First element is ``version``; then one entry per checked window
        (1 = model refitted, 0 = model kept).
    model_update_windows : list
        First element is ``"samples_seen_" + version``; then the value of
        ``samples_seen`` at each checked window.
    adwin : ADWIN
        Drift detector fed with the per-sample anomaly scores.

    Raises
    ------
    ValueError
        If ``n_estimators_updated`` is not in ``(0, 1]``.

    References
    ----------
    [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest."
        Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.
    [2] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation-based
        anomaly detection." ACM Transactions on Knowledge Discovery from
        Data (TKDD) 6.1 (2012): 3.
    [3] Ding, Zhiguo. (2013) An Anomaly Detection Approach Based on Isolation
        Forest Algorithm for Streaming Data Using Sliding Window. 12-17.
        10.3182/20130902-3-CN-3020.00044.
    """

    def __init__(
            self,
            window_size=100,
            n_estimators=25,
            anomaly_threshold=0.5,
            drift_threshold=0.5,
            random_state=None,
            version="AnomalyRate",
            # Parameters for partial model update
            n_estimators_updated=0.5,
            updated_randomly=True,
            # Parameters for NDKSWIN
            alpha=0.01,
            data=None,
            n_dimensions=1,
            n_tested_samples=0.1,
            fixed_checked_dimension=False,
            fixed_checked_sample=False):
        super().__init__()

        # Validate before the fraction is used to derive a tree count.
        if n_estimators_updated <= 0.0 or n_estimators_updated > 1.0:
            raise ValueError("n_estimators_updated must be > 0 and <= 1")

        self.n_estimators = n_estimators
        self.ensemble = None
        self.random_state = random_state
        self.window_size = window_size
        self.samples_seen = 0
        self.anomaly_rate = 0.20
        self.anomaly_threshold = anomaly_threshold
        self.drift_threshold = drift_threshold
        self.window = None
        self.prec_window = None
        self.cpt = 0
        self.version = version

        # Update history. The first element of each list is a header that
        # identifies the variant; later elements are appended per window.
        self.model_update = [version]
        self.model_update_windows = ["samples_seen_" + version]

        # Number of new trees to compute when updating on a new window.
        self.n_estimators_updated = int(
            self.n_estimators * n_estimators_updated)
        # True: choose the trees randomly. False: pick the first
        # (n_estimators - int(n_estimators * n_estimators_updated)) trees.
        self.updated_randomly = updated_randomly

        self.alpha = alpha
        self.n_dimensions = n_dimensions
        self.n_tested_samples = n_tested_samples
        self.fixed_checked_dimension = fixed_checked_dimension
        self.fixed_checked_sample = fixed_checked_sample

        self.first_time_fit = True
        # TODO Maurras 27112020: Find a way to optimize the use of ADWIN()
        self.adwin = ADWIN()

    def partial_fit(self, X, y=None, classes=None, sample_weight=None):
        """Partially (incrementally) fit the model, one row of X at a time.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_samples, n_features)
            The features to train the model.
        y : array-like of shape (n_samples,), optional
            Class labels. The model is unsupervised, so the labels are only
            passed through and never used; ``None`` is accepted.
        classes : None
            Not used by this method.
        sample_weight : None
            Not used by this method.

        Returns
        -------
        self
        """
        number_instances, _ = X.shape

        if self.samples_seen == 0:
            # Fresh (unfitted) ensemble; it is fitted once the first window
            # is complete.
            self.ensemble = IsolationTreeEnsemble(
                self.window_size, self.n_estimators, self.random_state)

        for i in range(number_instances):
            self._partial_fit(X[i], None if y is None else y[i])

        return self

    def _partial_fit(self, X, y=None):
        """Add one sample to the current window and, at a window boundary,
        fit or (for the "SADWIN" version) drift-check the model.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_features,)
            Instance attributes; reshaped to a single row internally.
        y : int, optional
            Class label for sample X. Not used (unsupervised model).
        """
        X = np.reshape(X, (1, len(X)))
        at_window_start = self.samples_seen % self.window_size == 0

        if at_window_start:
            # The window that just filled up becomes the previous window and
            # a new current window starts with this sample.
            self.prec_window = self.window
            self.window = X
        else:
            self.window = np.concatenate((self.window, X))

        if at_window_start and self.samples_seen != 0:
            if self.first_time_fit:
                # First complete window: initial fit.
                self.ensemble.fit(self.prec_window)
                self.first_time_fit = False
            elif self.version == "SADWIN":
                scores = self.ensemble.anomaly_score(self.prec_window)
                drift_detected = self._adwin_detects_drift(scores)

                self.model_update.append(1 if drift_detected else 0)
                self.model_update_windows.append(self.samples_seen)

                if drift_detected:
                    self.update_model(self.prec_window)
                    self.adwin.reset()

        self.samples_seen += 1

    def _adwin_detects_drift(self, scores):
        """Feed scores into ADWIN one by one; stop and return True on the
        first detected change, otherwise return False."""
        for score in scores:
            # Each element is indexed with [0], i.e. one score per row.
            self.adwin.add_element(score[0])
            if self.adwin.detected_change():
                return True
        return False

    def update_model(self, window):
        """Discard the current ensemble and fit a new one on ``window``.

        The refit is unconditional; deciding whether a drift happened is the
        caller's job.

        Parameters
        ----------
        window : numpy.ndarray of shape (self.window_size, n_features)
            Data used to fit the new isolation forest. The whole window is
            passed on purpose (author's note, 03/11/2020: the full window is
            needed to build the forest of isolation trees).
        """
        self.is_learning_phase_on = True
        self.ensemble = IsolationTreeEnsemble(
            self.window_size, self.n_estimators, self.random_state)
        self.ensemble.fit(window)
        print("")
        print(
            "The model was updated by training a new iForest with the version : "
            + self.version)

    def anomaly_scores_rate(self, window):
        """Return the fraction of instances in ``window`` whose anomaly score
        is above ``self.anomaly_threshold``.

        Scores are computed as ``2 ** (-path_length / c(len(window)))``.
        NOTE(review): ``c`` is defined elsewhere; presumably the isolation
        forest path-length normalisation -- confirm.

        Parameters
        ----------
        window : numpy.ndarray of shape (self.window_size, n_features)
        """
        score_tab = 2.0**(-1.0 * self.ensemble.path_length(window) /
                          c(len(window)))
        anomalies = sum(1 for x in score_tab if x > self.anomaly_threshold)
        return anomalies / len(score_tab)

    def predict_simple(self, X):
        """Classify every instance of ``X`` with the current ensemble.

        The anomaly scores of ``X`` are passed, together with
        ``self.anomaly_threshold``, to
        ``ensemble.predict_from_instances_scores``; its result is returned
        unchanged.
        """
        scores = self.ensemble.anomaly_score(X)
        return self.ensemble.predict_from_instances_scores(
            scores, self.anomaly_threshold)

    def predict(self, X):
        """Predict the label of a single instance.

        Returns ``[-1]`` while ``samples_seen <= window_size``. Otherwise the
        instance is appended to ``self.prec_window`` (this mutates the
        instance state), the whole previous window is scored, and the score
        of the last row is turned into a prediction by
        ``ensemble.predict_from_anomaly_scores``.

        Parameters
        ----------
        X : numpy.ndarray of shape (1, n_features)
            Only a single row is supported: X is reshaped to
            ``(1, len(X[0]))``.

        Returns
        -------
        list
            A one-element list holding the prediction, or ``[-1]``.
        """
        if self.samples_seen <= self.window_size:
            return [-1]

        X = np.reshape(X, (1, len(X[0])))
        # Append the instance to the previous sliding window.
        self.prec_window = np.concatenate((self.prec_window, X))

        prediction = self.ensemble.predict_from_anomaly_scores(
            self.predict_proba(self.prec_window), self.anomaly_threshold)
        return [prediction]

    def predict_proba(self, X):
        """Return the anomaly score of the last row of ``self.prec_window``.

        Returns ``[-1]`` while ``samples_seen <= window_size``.

        NOTE: ``X`` is not read. The score always comes from
        ``self.prec_window``, which ``predict`` extends with the instance
        before calling this method.

        Parameters
        ----------
        X : numpy.ndarray of shape (self.window_size, n_features)
            Ignored (see note above).
        """
        if self.samples_seen <= self.window_size:
            return [-1]
        # anomaly_score returns one score per row; [-1] picks the last row.
        return self.ensemble.anomaly_score(self.prec_window)[-1]
temp_drifts = [] df_results = pd.DataFrame({ 'y_true': results_dict['y_true'][-1], 'y_pred': results_dict['Predictions'][-1] }) df_results['Correct'] = (df_results['y_true'] == df_results['y_pred']) for i in range(df_results.shape[0]): adwin.add_element(df_results['Correct'].iloc[i]) if adwin.detected_change(): print('Change detected ADWIN in data: ' + str(df_results['Correct'].iloc[i]) + ' - at date: ' + str(results_dict['Date'][-1].iloc[i])) temp_drifts.append(results_dict['Date'][-1].iloc[i]) adwin.reset() if not temp_drifts: print('No Drift Detected - Predict next three months') start_test_date = start_test_date + pd.DateOffset(months=3) training_flag = False update_flag = False if temp_drifts: print('Drift detected - Choice on model') list_drift.append(temp_drifts[0]) start_train_date = temp_drifts[0] - pd.DateOffset(years=2) start_test_date = start_train_date + pd.DateOffset(years=2) if (temp_drifts[0] - datetime.date( xgboost_model.results['Training'][-1])) > timedelta(days=365):