def predict(self, X: Union[np.ndarray, list], drift_type: str = 'batch',
            return_p_val: bool = True, return_distance: bool = True) \
        -> Dict[str, Dict[str, Union[str, np.ndarray, int, float]]]:
    """
    Predict whether a batch of data has drifted from the reference data.

    Parameters
    ----------
    X
        Batch of instances.
    drift_type
        Predict drift at the 'feature' or 'batch' level. For 'batch', the K-S statistics for each
        feature are aggregated using the Bonferroni or False Discovery Rate correction.
    return_p_val
        Whether to return feature level p-values.
    return_distance
        Whether to return the K-S statistic between the features of the new batch and reference data.

    Returns
    -------
    Dictionary containing 'meta' and 'data' dictionaries.
    'meta' has the model's metadata.
    'data' contains the drift prediction and optionally the feature level p-values,
    threshold after multivariate correction if needed and K-S statistics.

    Raises
    ------
    ValueError
        If `drift_type` is neither 'feature' nor 'batch', or if `drift_type` is 'batch' and the
        detector's `correction` is neither 'bonferroni' nor 'fdr'.
    """
    # compute drift scores
    p_vals, dist = self.score(X)

    # values below p-value threshold are drift
    if drift_type == 'feature':
        # no multivariate correction: every feature is tested against the raw p-value
        threshold = self.p_val
        drift_pred = (p_vals < threshold).astype(int)
    elif drift_type == 'batch':
        if self.correction == 'bonferroni':
            threshold = self.p_val / self.n_features
            drift_pred = int((p_vals < threshold).any())
        elif self.correction == 'fdr':
            drift_pred, threshold = fdr(p_vals, q_val=self.p_val)
        else:
            # report the real culprit instead of blaming `drift_type`
            raise ValueError(
                '`correction` needs to be either `bonferroni` or `fdr`, '
                'got {!r}.'.format(self.correction))
    else:
        raise ValueError('`drift_type` needs to be either `feature` or `batch`.')

    # update reference dataset
    # NOTE(review): the batch is preprocessed first only when `preprocess_X_ref` is set,
    # presumably because the stored reference is then already preprocessed -- confirm.
    if (isinstance(self.update_X_ref, dict) and self.preprocess_fn is not None
            and self.preprocess_X_ref):
        X = self.preprocess_fn(X)
    self.X_ref = update_reference(self.X_ref, X, self.n, self.update_X_ref)
    # used for reservoir sampling; `len` works for both arrays and lists of instances
    self.n += len(X)

    # populate drift dict
    cd = concept_drift_dict()
    cd['meta'] = self.meta
    cd['data']['is_drift'] = drift_pred
    if return_p_val:
        cd['data']['p_val'] = p_vals
        cd['data']['threshold'] = threshold
    if return_distance:
        cd['data']['distance'] = dist
    return cd
def test_fdr(fdr_params):
    """Check that `fdr` reports the expected below-threshold outcome.

    `fdr_params` unpacks into the q-value and a mapping with the keys
    'p_val' (the p-values to correct) and 'is_below' (the expected result).
    """
    q_val, p_vals = fdr_params
    if p_vals['is_below'] and p_vals['p_val'].max() == 0:
        # p-values with a maximum of 0 are shifted to sit just under the q-value
        p_val = p_vals['p_val'] + q_val - 1e-5
    elif not p_vals['is_below'] and p_vals['p_val'].max() == 0:
        # ... or exactly onto the q-value when no drift is expected
        p_val = p_vals['p_val'] + q_val
    else:
        p_val = p_vals['p_val'].copy()
    # `fdr` is unpacked into (prediction, threshold) by its other callers in this
    # file, so compare only the prediction rather than the whole returned pair.
    below_threshold, _ = fdr(p_val, q_val)
    assert below_threshold == p_vals['is_below']
def test_fdr(fdr_params):
    """Verify both the prediction and the threshold type returned by `fdr`.

    `fdr_params` unpacks into the q-value and a mapping holding the raw
    p-values under 'p_val' and the expected outcome under 'is_below'.
    """
    q_val, case = fdr_params
    expected = case['is_below']
    raw = case['p_val']

    if raw.max() == 0:
        # Shift the p-values onto the q-value; when drift is expected, nudge
        # them just below it afterwards.
        p_val = raw + q_val
        if expected:
            p_val = p_val - 1e-5
    else:
        p_val = raw.copy()

    result = fdr(p_val, q_val)
    below_threshold, thresholds = result
    assert below_threshold == expected
    assert isinstance(thresholds, (np.ndarray, float))