def predict(self, X: Union[np.ndarray, list], drift_type: str = 'batch',
            return_p_val: bool = True, return_distance: bool = True) \
        -> Dict[str, Dict[str, Union[np.ndarray, int, float]]]:
    """
    Predict whether a batch of data has drifted from the reference data.

    Parameters
    ----------
    X
        Batch of instances.
    drift_type
        Predict drift at the 'feature' or 'batch' level. For 'batch', the K-S statistics for each
        feature are aggregated using the Bonferroni or False Discovery Rate correction.
    return_p_val
        Whether to return feature level p-values.
    return_distance
        Whether to return the K-S statistic between the features of the new batch and reference data.

    Returns
    -------
    Dictionary containing 'meta' and 'data' dictionaries.
    'meta' has the model's metadata.
    'data' contains the drift prediction and optionally the feature level p-values,
    threshold after multivariate correction if needed and K-S statistics.

    Raises
    ------
    ValueError
        If `drift_type` is not 'feature' or 'batch', or if `drift_type` is 'batch' and the
        detector's `correction` is not 'bonferroni' or 'fdr'.
    """
    # compute drift scores
    p_vals, dist = self.score(X)

    # values below p-value threshold are drift
    if drift_type == 'feature':
        # feature level predictions use the uncorrected p-value threshold
        threshold = self.p_val
        drift_pred = (p_vals < threshold).astype(int)
    elif drift_type == 'batch':
        if self.correction == 'bonferroni':
            threshold = self.p_val / self.n_features
            drift_pred = int((p_vals < threshold).any())
        elif self.correction == 'fdr':
            drift_pred, threshold = fdr(p_vals, q_val=self.p_val)
        else:
            # report the setting that is actually wrong instead of blaming `drift_type`
            raise ValueError('`correction` needs to be either `bonferroni` or `fdr`.')
    else:
        raise ValueError('`drift_type` needs to be either `feature` or `batch`.')

    # update reference dataset
    if (isinstance(self.update_X_ref, dict) and self.preprocess_fn is not None
            and self.preprocess_X_ref):
        X = self.preprocess_fn(X)
    self.X_ref = update_reference(self.X_ref, X, self.n, self.update_X_ref)
    # used for reservoir sampling; `len` also works when `X` is a list
    self.n += len(X)

    # populate drift dict
    cd = concept_drift_dict()
    cd['meta'] = self.meta
    cd['data']['is_drift'] = drift_pred
    if return_p_val:
        cd['data']['p_val'] = p_vals
        cd['data']['threshold'] = threshold
    if return_distance:
        cd['data']['distance'] = dist
    return cd
def predict(self, x: Union[np.ndarray, list], return_p_val: bool = True, return_distance: bool = True,
            return_probs: bool = True, return_model: bool = True) \
        -> Dict[str, Dict[str, Union[str, int, float, Callable]]]:
    """
    Predict whether a batch of data has drifted from the reference data.

    Parameters
    ----------
    x
        Batch of instances.
    return_p_val
        Whether to return the p-value of the test.
    return_distance
        Whether to return a notion of strength of the drift.
        K-S test stat if binarize_preds=False, otherwise relative error reduction.
    return_probs
        Whether to return the instance level classifier probabilities for the reference and test data
        (0=reference data, 1=test data).
    return_model
        Whether to return the updated model trained to discriminate reference and test instances.

    Returns
    -------
    Dictionary containing 'meta' and 'data' dictionaries.
    'meta' has the model's metadata.
    'data' contains the drift prediction and optionally the p-value, performance of the classifier
    relative to its expectation under the no-change null, the out-of-fold classifier model
    prediction probabilities on the reference and test data, and the trained model.
    """
    # score the batch; drift is flagged when the p-value is below the detector's threshold
    p_val, dist, probs_ref, probs_test = self.score(x)
    is_drift = int(p_val < self.p_val)

    # preprocess the batch before it is passed on to the reference update
    if isinstance(self.update_x_ref, dict):
        if self.preprocess_fn is not None and self.preprocess_x_ref:
            x = self.preprocess_fn(x)
    # TODO: TBD: can `x` ever be a `list` after pre-processing? update_references and downstream
    #  functions don't support list inputs and without the type: ignore[arg-type] mypy complains
    self.x_ref = update_reference(self.x_ref, x, self.n, self.update_x_ref)  # type: ignore[arg-type]
    # used for reservoir sampling
    self.n += len(x)

    # assemble the output dictionary
    result = concept_drift_dict()
    result['meta'] = self.meta
    data = result['data']
    data['is_drift'] = is_drift
    if return_p_val:
        data.update(p_val=p_val, threshold=self.p_val)
    if return_distance:
        data['distance'] = dist
    if return_probs:
        data.update(probs_ref=probs_ref, probs_test=probs_test)
    if return_model:
        data['model'] = self.model
    return result
def predict(self,  # type: ignore[override]
            x: Union[np.ndarray, list], c: np.ndarray,
            return_p_val: bool = True, return_distance: bool = True, return_coupling: bool = False) \
        -> Dict[str, Dict[str, Union[int, float]]]:
    """
    Predict whether a batch of data has drifted from the reference data, given the provided context.

    Parameters
    ----------
    x
        Batch of instances.
    c
        Context associated with batch of instances.
    return_p_val
        Whether to return the p-value of the permutation test.
    return_distance
        Whether to return the conditional MMD test statistic between the new batch and reference data.
    return_coupling
        Whether to return the coupling matrices.

    Returns
    -------
    Dictionary containing 'meta' and 'data' dictionaries.
    'meta' has the model's metadata.
    'data' contains the drift prediction and optionally the p-value, threshold, conditional MMD test
    statistic and coupling matrices.
    """
    # compute drift scores
    p_val, dist, distance_threshold, coupling = self.score(x, c)
    drift_pred = int(p_val < self.p_val)

    # update reference dataset
    if isinstance(self.update_ref, dict) and self.preprocess_fn is not None and self.preprocess_x_ref:
        x = self.preprocess_fn(x)
    # instances and their contexts are updated with the same counter and update settings
    self.x_ref = update_reference(self.x_ref, x, self.n, self.update_ref)  # type: ignore[arg-type]
    self.c_ref = update_reference(self.c_ref, c, self.n, self.update_ref)  # type: ignore[arg-type]
    # used for reservoir sampling
    self.n += len(x)

    # populate drift dict
    cd = concept_drift_dict()
    cd['meta'] = self.meta
    cd['data']['is_drift'] = drift_pred
    if return_p_val:
        cd['data']['p_val'] = p_val
        cd['data']['threshold'] = self.p_val
    if return_distance:
        cd['data']['distance'] = dist
        cd['data']['distance_threshold'] = distance_threshold
    if return_coupling:
        # `coupling` is indexed positionally: xx, yy, xy
        cd['data']['coupling_xx'] = coupling[0]
        cd['data']['coupling_yy'] = coupling[1]
        cd['data']['coupling_xy'] = coupling[2]
    return cd
def predict(
    self, X: np.ndarray, drift_type: str = "batch", return_p_val: bool = True
) -> Dict[str, Dict[str, np.ndarray]]:
    """
    Return a canned drift prediction without inspecting the input.

    Parameters
    ----------
    X
        Batch of instances. Not used.
    drift_type
        Not used.
    return_p_val
        Not used.

    Returns
    -------
    Dictionary created by `concept_drift_dict` whose 'data' entry has 'is_drift' set to
    `self.expect_return_is_drift`.
    """
    cd = concept_drift_dict()
    cd["data"]["is_drift"] = self.expect_return_is_drift
    return cd
def predict(
    self, X: np.ndarray, drift_type: str = "batch", return_p_val: bool = True
) -> Dict[str, Dict[str, np.ndarray]]:
    """
    Return a canned drift prediction with fixed scores, without inspecting the input.

    Parameters
    ----------
    X
        Batch of instances. Not used.
    drift_type
        Not used.
    return_p_val
        Not used.

    Returns
    -------
    Dictionary created by `concept_drift_dict` whose 'data' entry has 'is_drift' set to
    `self.expect_return_is_drift`, 'distance' and 'p_val' set to `[0.1, 0.2, 0.3]` and
    'threshold' set to `0.1`.
    """
    cd = concept_drift_dict()
    cd["data"]["is_drift"] = self.expect_return_is_drift
    # fixed values, independent of `X`
    cd["data"]["distance"] = [0.1, 0.2, 0.3]
    cd["data"]["p_val"] = [0.1, 0.2, 0.3]
    cd["data"]["threshold"] = 0.1
    return cd
def predict(self, x: Union[np.ndarray, list], return_p_val: bool = True, return_distance: bool = True) \
        -> Dict[str, Dict[str, Union[int, float]]]:
    """
    Predict whether a batch of data has drifted from the reference data.

    Parameters
    ----------
    x
        Batch of instances.
    return_p_val
        Whether to return the p-value of the permutation test.
    return_distance
        Whether to return the LSDD metric between the new batch and reference data.

    Returns
    -------
    Dictionary containing 'meta' and 'data' dictionaries.
    'meta' has the model's metadata.
    'data' contains the drift prediction and optionally the p-value, threshold and LSDD metric.
    """
    # compute drift scores
    p_val, dist, dist_permutations = self.score(x)
    drift_pred = int(p_val < self.p_val)

    # compute distance threshold: sort the permutation distances in descending order and
    # pick the entry at the position given by the p-value threshold
    idx_threshold = int(self.p_val * len(dist_permutations))
    distance_threshold = np.sort(dist_permutations)[::-1][idx_threshold]

    # update reference dataset
    if isinstance(self.update_x_ref, dict):
        if self.preprocess_fn is not None and self.preprocess_x_ref:
            x = self.preprocess_fn(x)
            x = self._normalize(x)  # type: ignore
        elif self.preprocess_fn is None:
            x = self._normalize(x)  # type: ignore
        # otherwise (preprocess_fn set but preprocess_x_ref falsy) `x` is passed on unchanged
    self.x_ref = update_reference(self.x_ref, x, self.n, self.update_x_ref)
    # used for reservoir sampling
    self.n += len(x)  # type: ignore

    # populate drift dict
    cd = concept_drift_dict()
    cd['meta'] = self.meta
    cd['data']['is_drift'] = drift_pred
    if return_p_val:
        cd['data']['p_val'] = p_val
        cd['data']['threshold'] = self.p_val
    if return_distance:
        cd['data']['distance'] = dist
        cd['data']['distance_threshold'] = distance_threshold
    return cd
def predict(self, x: Union[np.ndarray, list], return_p_val: bool = True, return_distance: bool = True,
            return_kernel: bool = True) \
        -> Dict[str, Dict[str, Union[int, float, Callable]]]:
    """
    Predict whether a batch of data has drifted from the reference data.

    Parameters
    ----------
    x
        Batch of instances.
    return_p_val
        Whether to return the p-value of the permutation test.
    return_distance
        Whether to return the MMD metric between the new batch and reference data.
    return_kernel
        Whether to return the updated kernel trained to discriminate reference and test instances.

    Returns
    -------
    Dictionary containing 'meta' and 'data' dictionaries.
    'meta' has the detector's metadata.
    'data' contains the drift prediction and optionally the p-value, threshold, MMD metric and
    trained kernel.
    """
    # compute drift scores
    p_val, dist, distance_threshold = self.score(x)
    drift_pred = int(p_val < self.p_val)

    # update reference dataset
    if isinstance(self.update_x_ref, dict) and self.preprocess_fn is not None and self.preprocess_x_ref:
        x = self.preprocess_fn(x)
    self.x_ref = update_reference(self.x_ref, x, self.n, self.update_x_ref)  # type: ignore[arg-type]
    # used for reservoir sampling
    self.n += len(x)

    # populate drift dict
    cd = concept_drift_dict()
    cd['meta'] = self.meta
    cd['data']['is_drift'] = drift_pred
    if return_p_val:
        cd['data']['p_val'] = p_val
        cd['data']['threshold'] = self.p_val
    if return_distance:
        cd['data']['distance'] = dist
        cd['data']['distance_threshold'] = distance_threshold
    if return_kernel:
        cd['data']['kernel'] = self.kernel
    return cd
def predict(
    self,
    x_t: Union[np.ndarray, Any],
    return_test_stat: bool = True,
) -> Dict[str, Dict[str, Union[int, float]]]:
    """
    Predict whether the most recent window of data has drifted from the reference data.

    Parameters
    ----------
    x_t
        A single instance to be added to the test-window.
    return_test_stat
        Whether to return the test statistic and threshold.

    Returns
    -------
    Dictionary containing 'meta' and 'data' dictionaries.
    'meta' has the model's metadata.
    'data' contains the drift prediction and optionally the test-statistic and threshold.
    """
    self.t += 1

    # preprocess if necessary
    if callable(self.preprocess_fn):
        # the preprocessing function is called on a batch of one (leading axis added for arrays,
        # single-element list otherwise); the single result is then taken back out
        x_t = x_t[None, :] if isinstance(x_t, np.ndarray) else [x_t]
        x_t = self.preprocess_fn(x_t)[0]  # type: ignore

    # update test window and return updated test stat
    test_stat = self.score(x_t)
    threshold = self.get_threshold(self.t)
    # a `None` test statistic is reported as no drift
    drift_pred = 0 if test_stat is None else int(test_stat > threshold)

    # append to the stored history of statistics and predictions
    self.test_stats = np.concatenate([self.test_stats, np.array([test_stat])])
    self.drift_preds = np.concatenate([self.drift_preds, np.array([drift_pred])])

    # populate drift dict
    cd = concept_drift_dict()
    cd['meta'] = self.meta
    cd['data']['is_drift'] = drift_pred
    cd['data']['time'] = self.t
    cd['data']['ert'] = self.ert
    if return_test_stat:
        cd['data']['test_stat'] = test_stat
        cd['data']['threshold'] = threshold
    return cd
def predict(
    self,
    x_t: Union[np.ndarray, Any],
    return_test_stat: bool = True,
) -> Dict[str, Dict[str, Union[int, float]]]:
    """
    Predict whether the most recent window(s) of data have drifted from the reference data.

    Parameters
    ----------
    x_t
        A single instance to be added to the test-window(s).
    return_test_stat
        Whether to return the test statistic and threshold.

    Returns
    -------
    Dictionary containing 'meta' and 'data' dictionaries.
    'meta' has the model's metadata.
    'data' contains the drift prediction and optionally the test-statistic and threshold.
    """
    # Compute test stat and check for drift
    test_stats = self.score(x_t)
    # Note t-1 here, as we wish to use the unconditional thresholds
    thresholds = self.get_threshold(self.t - 1)
    drift_pred = self._check_drift(test_stats, thresholds)

    # Update results attributes; the new statistics get a leading axis so they stack onto the history
    self.test_stats = np.concatenate([self.test_stats, test_stats[None, :, :]])
    self.drift_preds = np.concatenate([self.drift_preds, np.array([drift_pred])])

    # populate drift dict
    cd = concept_drift_dict()
    cd['meta'] = self.meta
    cd['data']['is_drift'] = drift_pred
    cd['data']['time'] = self.t
    cd['data']['ert'] = self.ert
    if return_test_stat:
        cd['data']['test_stat'] = test_stats
        cd['data']['threshold'] = thresholds
    return cd
def predict(self, x: np.ndarray, return_p_val: bool = True,
            return_distance: bool = True) -> Dict[str, Dict[str, Union[int, float]]]:
    """
    Predict whether a batch of data has drifted from the reference data.

    Parameters
    ----------
    x
        Batch of instances.
    return_p_val
        Whether to return the p-value of the test.
    return_distance
        Whether to return a notion of strength of the drift.
        K-S test stat if binarize_preds=False, otherwise relative error reduction.

    Returns
    -------
    Dictionary containing 'meta' and 'data' dictionaries.
    'meta' has the model's metadata.
    'data' contains the drift prediction and optionally the p-value, threshold and the performance
    of the classifier relative to its expectation under the no-change null.
    """
    # compute drift scores
    p_val, dist = self.score(x)
    drift_pred = int(p_val < self.p_val)

    # update reference dataset
    if isinstance(self.update_x_ref, dict) and self.preprocess_fn is not None and self.preprocess_x_ref:
        x = self.preprocess_fn(x)
    self.x_ref = update_reference(self.x_ref, x, self.n, self.update_x_ref)
    # used for reservoir sampling; `len` does not require the preprocessed batch to expose `.shape`
    self.n += len(x)

    # populate drift dict
    # TODO: add instance level feedback
    cd = concept_drift_dict()
    cd['meta'] = self.meta
    cd['data']['is_drift'] = drift_pred
    if return_p_val:
        cd['data']['p_val'] = p_val
        cd['data']['threshold'] = self.p_val
    if return_distance:
        cd['data']['distance'] = dist
    return cd
def predict(self, X: Union[np.ndarray, list], return_p_val: bool = True,
            return_distance: bool = True) -> Dict[str, Dict[str, Union[int, float]]]:
    """
    Predict whether a batch of data has drifted from the reference data.

    Parameters
    ----------
    X
        Batch of instances.
    return_p_val
        Whether to return the p-value of the permutation test.
    return_distance
        Whether to return the MMD metric between the new batch and reference data.

    Returns
    -------
    Dictionary containing 'meta' and 'data' dictionaries.
    'meta' has the model's metadata.
    'data' contains the drift prediction and optionally the p-value, threshold and MMD metric.
    """
    # compute drift scores
    p_val, dist = self.score(X)
    drift_pred = int(p_val < self.p_val)

    # update reference dataset
    if (isinstance(self.update_X_ref, dict) and self.preprocess_fn is not None
            and self.preprocess_X_ref):
        X = self.preprocess_fn(X)
    self.X_ref = update_reference(self.X_ref, X, self.n, self.update_X_ref)
    # used for reservoir sampling; `len` also works when `X` is a list
    self.n += len(X)

    # populate drift dict
    cd = concept_drift_dict()
    cd['meta'] = self.meta
    cd['data']['is_drift'] = drift_pred
    if return_p_val:
        cd['data']['p_val'] = p_val
        cd['data']['threshold'] = self.p_val
    if return_distance:
        cd['data']['distance'] = dist
    return cd
def predict(
    self,
    x_t: Union[np.ndarray, Any],
    return_test_stat: bool = True,
) -> Dict[str, Dict[str, Union[int, float]]]:
    """
    Predict whether the most recent window of data has drifted from the reference data.

    Parameters
    ----------
    x_t
        A single instance to be added to the test-window.
    return_test_stat
        Whether to return the test statistic and threshold.

    Returns
    -------
    Dictionary containing 'meta' and 'data' dictionaries.
    'meta' has the model's metadata.
    'data' contains the drift prediction and optionally the test-statistic and threshold.
    """
    # Compute test stat and check for drift
    test_stat = self.score(x_t)
    threshold = self.get_threshold(self.t)
    drift_pred = int(test_stat > threshold)

    # append to the stored history of statistics and predictions
    self.test_stats = np.concatenate([self.test_stats, np.array([test_stat])])
    self.drift_preds = np.concatenate([self.drift_preds, np.array([drift_pred])])

    # populate drift dict
    cd = concept_drift_dict()
    cd['meta'] = self.meta
    cd['data']['is_drift'] = drift_pred
    cd['data']['time'] = self.t
    cd['data']['ert'] = self.ert
    if return_test_stat:
        cd['data']['test_stat'] = test_stat
        cd['data']['threshold'] = threshold
    return cd
def predict(self, X: Union[np.ndarray, list], return_metric: bool = True) \
        -> Dict[str, Dict[str, Union[int, float]]]:
    """
    Predict whether a batch of data has drifted from the reference data.

    Parameters
    ----------
    X
        Batch of instances.
    return_metric
        Whether to return the drift metric from the detector.

    Returns
    -------
    Dictionary containing 'meta' and 'data' dictionaries.
    'meta' has the model's metadata.
    'data' contains the drift prediction and optionally the drift metric and threshold.
    """
    # compute drift scores; drift is flagged when the metric exceeds the detector's threshold
    drift_metric = self.score(X)
    drift_pred = int(drift_metric > self.threshold)

    # update reference dataset
    if isinstance(self.update_X_ref, dict) and self.preprocess_fn is not None and self.preprocess_X_ref:
        X = self.preprocess_fn(X)
    self.X_ref = update_reference(self.X_ref, X, self.n, self.update_X_ref)
    # used for reservoir sampling; `len` also works when `X` is a list
    self.n += len(X)

    # populate drift dict
    cd = concept_drift_dict()
    cd['meta'] = self.meta
    cd['data']['is_drift'] = drift_pred
    if return_metric:
        # the metric is stored under the detector-specific name held in `self.metric_name`
        cd['data'][self.metric_name] = drift_metric
        cd['data']['threshold'] = self.threshold
    return cd