def test_isotonic_regression():
    """Check ``isotonic_regression`` and the ``IsotonicRegression`` estimator.

    Covers the functional API on two small inputs, agreement between
    ``fit().transform()``, ``fit_transform()`` and ``predict()``, invariance
    under a permutation of the samples, and the case where all ``x`` are equal.
    """
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    y_ = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(y_, isotonic_regression(y))

    y = np.array([10, 0, 2])
    y_ = np.array([4, 4, 4])
    assert_array_equal(y_, isotonic_regression(y))

    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # check that it is immune to permutation; a seeded generator keeps any
    # failure reproducible and leaves the global NumPy RNG state untouched
    rng = np.random.RandomState(42)
    perm = rng.permutation(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    assert_array_equal(ir.fit_transform(x[perm], y[perm]),
                       ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])

    # check we don't crash when all x are equal:
    ir = IsotonicRegression()
    assert_array_equal(ir.fit_transform(np.ones(len(x)), y), np.mean(y))
def test_isotonic_regression_oob_bad_after():
    """An invalid ``out_of_bounds`` set after fitting must make ``transform`` raise."""
    targets = np.array([3, 7, 5, 9, 8, 7, 10])
    positions = np.arange(len(targets))

    # Fit with a valid setting first, then corrupt the attribute afterwards.
    model = IsotonicRegression(increasing="auto", out_of_bounds="raise")
    model.fit(positions, targets)
    model.out_of_bounds = "xyz"

    expected = "The argument ``out_of_bounds`` must be in 'nan', 'clip', 'raise'; got xyz"
    with pytest.raises(ValueError, match=expected):
        model.transform(positions)
def bootstrap_calibrate_prob(labels, weights, probs, n_calibrations=30, group_column=None,
                             threshold=0., symmetrize=False, plot=False):
    """
    Bootstrap isotonic calibration.

    Each of the ``n_calibrations`` rounds:

    * splits the data in half (group-aware when ``group_column`` is given)
    * fits an isotonic regression on the train half
    * applies it to the test half and computes D2 and the AUC there

    :param labels: numpy.array of shape [n_samples] with labels
    :param weights: numpy.array of shape [n_samples]
    :param probs: probabilities, numpy.array of shape [n_samples]
    :param n_calibrations: number of random train/test rounds
    :param group_column: if not None, passed to ``train_test_split_group``
    :param threshold: float, labels above it become 1, the others 0
    :param symmetrize: bool, do symmetric calibration, ex. for B+, B-
    :param plot: bool, show the fitted calibration curve for every round
    :return: list of D2 values and list of AUC values, one per round
    """
    labels = (labels > threshold) * 1
    d2_values = []
    auc_values = []

    for _ in range(n_calibrations):
        if group_column is None:
            split = train_test_split(probs, labels, weights, train_size=0.5)
        else:
            split = train_test_split_group(group_column, probs, labels, weights, train_size=0.5)
        train_probs, test_probs, train_labels, test_labels, train_weights, test_weights = split

        calibrator = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
        if not symmetrize:
            calibrator.fit(train_probs, train_labels, train_weights)
        else:
            # every sample is used twice (p and 1 - p), so halve its weight
            train_weights = 0.5 * train_weights
            mirrored_probs = numpy.r_[train_probs, 1 - train_probs]
            mirrored_labels = numpy.r_[train_labels > 0, train_labels <= 0]
            mirrored_weights = numpy.r_[train_weights, train_weights]
            calibrator.fit(mirrored_probs, mirrored_labels, mirrored_weights)

        probs_calib = calibrator.transform(test_probs)

        if plot:
            plt.figure(1, figsize=(6, 5))
            plt.scatter(train_probs, train_labels, color='black', zorder=20)
            grid = numpy.linspace(0.001, 0.999, 500)
            plt.plot(grid, calibrator.transform(grid), color='blue', linewidth=3)
            plt.show()

        alpha = (1 - 2 * probs_calib) ** 2
        auc_values.append(roc_auc_score(test_labels, test_probs, sample_weight=test_weights))
        d2_values.append(numpy.average(alpha, weights=test_weights))

    return d2_values, auc_values
class SavableIsotonicRegression(object):
    """Isotonic regression separating ``origvals`` from ``nullvals``.

    The model is refit from the stored values on load, so only the inputs and
    the two settings are written to HDF5.
    """

    def __init__(self, origvals, nullvals, increasing, min_frac_neg=0.95):
        self.origvals = origvals
        self.nullvals = nullvals
        self.increasing = increasing
        self.min_frac_neg = min_frac_neg

        regressor = IsotonicRegression(out_of_bounds='clip', increasing=increasing)
        n_orig = len(self.origvals)
        n_null = len(self.nullvals)
        inputs = np.concatenate([self.origvals, self.nullvals], axis=0)
        targets = [1.0] * n_orig + [0.0] * n_null
        # null samples are reweighted so both groups carry the same total weight
        weights = [1.0] * n_orig + [float(n_orig) / n_null for _ in range(n_null)]
        self.ir = regressor.fit(X=inputs, y=targets, sample_weight=weights)

        # Infer frac_pos based on the minimum value of the ir probs
        # See derivation in irval_to_probpos function
        if self.increasing:
            min_prec_x = self.ir.X_min_
        else:
            min_prec_x = self.ir.X_max_
        min_precision = self.ir.transform([min_prec_x])[0]
        implied_frac_neg = -1 / (1 - (1 / max(min_precision, 1e-7)))
        print("For increasing =", increasing, ", the minimum IR precision was",
              min_precision, "occurring at", min_prec_x, "implying a frac_neg",
              "of", implied_frac_neg)

        out_of_range = implied_frac_neg > 1.0 or implied_frac_neg < self.min_frac_neg
        if out_of_range:
            implied_frac_neg = max(min(1.0, implied_frac_neg), self.min_frac_neg)
            print("To be conservative, adjusted frac neg is", implied_frac_neg)
        self.implied_frac_neg = implied_frac_neg

    def transform(self, vals):
        """Map ``vals`` through the fit and convert with ``irval_to_probpos``."""
        ir_values = self.ir.transform(vals)
        return irval_to_probpos(ir_values, frac_neg=self.implied_frac_neg)

    def save_hdf5(self, grp):
        """Write the settings as attributes and the two value arrays as datasets."""
        for key in ('increasing', 'min_frac_neg'):
            grp.attrs[key] = getattr(self, key)
        for key in ('origvals', 'nullvals'):
            grp.create_dataset(key, data=getattr(self, key))

    @classmethod
    def from_hdf5(cls, grp):
        """Rebuild (and refit) an instance from a group written by ``save_hdf5``."""
        settings = grp.attrs
        increasing = settings['increasing']
        min_frac_neg = settings['min_frac_neg']
        return cls(origvals=np.array(grp['origvals']),
                   nullvals=np.array(grp['nullvals']),
                   increasing=increasing,
                   min_frac_neg=min_frac_neg)
def test_isotonic_regression():
    """Check ``isotonic_regression`` and the ``IsotonicRegression`` estimator.

    Covers the functional API on a small input, agreement between
    ``fit().transform()``, ``fit_transform()`` and ``predict()``, and
    invariance under a permutation of the samples.
    """
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    y_ = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(y_, isotonic_regression(y))

    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0.0, y_max=1.0)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # check that it is immune to permutation; a seeded generator keeps any
    # failure reproducible and leaves the global NumPy RNG state untouched
    rng = np.random.RandomState(42)
    perm = rng.permutation(len(y))
    ir = IsotonicRegression(y_min=0.0, y_max=1.0)
    assert_array_equal(ir.fit_transform(x[perm], y[perm]),
                       ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])
class HTMLTime(object):
    """Convert a frame number to a time using an isotonic fit of an .idx file.

    >>> htmlTime = HTMLTime(pathToIDX)
    >>> t = htmlTime(frameNumber)
    """

    def __init__(self, idx):
        """Load the .idx table at path ``idx`` and fit frame number -> seconds."""
        super(HTMLTime, self).__init__()
        self.idx = idx

        # load .idx file using pandas; the separator is a raw string so the
        # regex reaches pandas unchanged instead of relying on an invalid
        # string escape
        df = read_table(
            self.idx, sep=r'\s+',
            names=['frame_number', 'frame_type', 'bytes', 'seconds']
        )
        # builtin float: the ``np.float`` alias no longer exists in NumPy
        x = np.array(df['frame_number'], dtype=float)
        y = np.array(df['seconds'], dtype=float)

        # train isotonic regression
        self.ir = IsotonicRegression(y_min=np.min(y), y_max=np.max(y))
        self.ir.fit(x, y)

        # frame number support
        self.xmin = np.min(x)
        self.xmax = np.max(x)

    def __call__(self, frameNumber):
        """Return the fitted time for ``frameNumber``, clamped to the fitted range."""
        clamped = min(self.xmax, max(self.xmin, frameNumber))
        return self.ir.transform([clamped])[0]
def test_isotonic_2darray_more_than_1_feature():
    """IsotonicRegression must reject inputs with more than one feature."""
    n_samples = 10
    single = np.arange(n_samples)
    double = np.c_[single, single]
    target = np.arange(n_samples)
    expected = "should be a 1d array or 2d array with 1 feature"

    with pytest.raises(ValueError, match=expected):
        IsotonicRegression().fit(double, target)

    # A correctly fitted model must still reject 2-feature input afterwards.
    fitted = IsotonicRegression().fit(single, target)
    for method in (fitted.predict, fitted.transform):
        with pytest.raises(ValueError, match=expected):
            method(double)
def calibration_isotonic_regression(model_name, model, prob_model, X_calibration, y_calibration, X_train):
    """Calibrate model probabilities with an isotonic regression.

    1. Trains the calibration regressor on the calibration data.
    2. Applies it to ``prob_model`` (the model's probabilities on the test
       data) and returns the calibrated probabilities, for further
       calculation of the calibrated std.

    ref: https://arxiv.org/abs/1807.00263

    :param model_name: ``'Bayes_Ridge_model'`` or ``'RF_model'``
    :param model: fitted model used to predict on ``X_calibration``
    :param prob_model: probabilities to calibrate
    :param X_calibration: calibration features
    :param y_calibration: calibration targets
    :param X_train: training features (only used for ``'RF_model'``)
    :return: calibrated version of ``prob_model``
    :raises ValueError: if ``model_name`` is not one of the supported names
    """
    if model_name == 'Bayes_Ridge_model':
        y_hat_calibration, sem_hat_calibration = model.predict(X_calibration, return_std=True)
    elif model_name == 'RF_model':
        y_hat_calibration = model.predict(X_calibration)
        sem_hat_calibration = np.sqrt(
            fci.random_forest_error(model, X_train, X_calibration))
    else:
        # Fail here with a clear message instead of printing and then
        # crashing further down on the undefined prediction variables.
        raise ValueError(
            "Not able to calculate variance for model_name=%r; expected "
            "'Bayes_Ridge_model' or 'RF_model'" % (model_name,))

    _, prob_y_calibration, _, _ = count_entries_per_interval(
        y_calibration, y_hat_calibration, sem_hat_calibration)
    prob_model_y_calibration = predict_prob(y_calibration, y_hat_calibration, sem_hat_calibration)

    # isotonic regression
    from sklearn.isotonic import IsotonicRegression as IR
    ir = IR(out_of_bounds='clip')
    ir.fit(prob_model_y_calibration, prob_y_calibration)
    prob_test_calibrated = ir.transform(prob_model)
    return prob_test_calibrated
def test_isotonic_regression_ties_secondary_():
    """
    Compare fit, transform and fit_transform with the "secondary" ties method
    of the R "isotone" package on its "pituitary" data, as detailed in:
    J. d. Leeuw, K. Hornik, P. Mair,
    Isotone Optimization in R: Pool-Adjacent-Violators Algorithm (PAVA) and
    Active Set Methods

    Reference values were produced with the R commands from that paper:
    > library("isotone")
    > data("pituitary")
    > res1 <- gpava(pituitary$age, pituitary$size, ties="secondary")
    > res1$x

    `isotone` version: 1.0-2, 2014-09-07
    R version: R version 3.1.1 (2014-07-10)
    """
    ages = [8, 8, 8, 10, 10, 10, 12, 12, 12, 14, 14]
    sizes = [21, 23.5, 23, 24, 21, 25, 21.5, 22, 19, 23.5, 25]
    expected = [22.22222] * 9 + [24.25] * 2

    # Check fit, transform and fit_transform
    model = IsotonicRegression()
    model.fit(ages, sizes)
    assert_array_almost_equal(model.transform(ages), expected, 4)
    assert_array_almost_equal(model.fit_transform(ages, sizes), expected, 4)
def test_isotonic_regression_ties_secondary_():
    """
    Compare fit, transform and fit_transform with the "secondary" ties method
    of the R "isotone" package on its "pituitary" data, as detailed in:
    J. d. Leeuw, K. Hornik, P. Mair,
    Isotone Optimization in R: Pool-Adjacent-Violators Algorithm (PAVA) and
    Active Set Methods

    Reference values were produced with the R commands from that paper:
    > library("isotone")
    > data("pituitary")
    > res1 <- gpava(pituitary$age, pituitary$size, ties="secondary")
    > res1$x

    `isotone` version: 1.0-2, 2014-09-07
    R version: R version 3.1.1 (2014-07-10)
    """
    ages = [8, 8, 8, 10, 10, 10, 12, 12, 12, 14, 14]
    sizes = [21, 23.5, 23, 24, 21, 25, 21.5, 22, 19, 23.5, 25]
    # nine pooled values followed by the two values of the last age group
    expected = [22.22222] * 9 + [24.25] * 2

    # Check fit, transform and fit_transform
    model = IsotonicRegression()
    model.fit(ages, sizes)
    assert_array_almost_equal(model.transform(ages), expected, 4)
    assert_array_almost_equal(model.fit_transform(ages, sizes), expected, 4)
class IsotonicCalibrator(BaseEstimator, TransformerMixin):
    """
    Calculates a likelihood ratio of a score value, provided it is from one of
    two distributions. Uses isotonic regression for interpolation.
    """

    def __init__(self, add_one=False):
        self.add_one = add_one
        self._ir = IsotonicRegression()

    def fit(self, X, y, **fit_params):
        """Fit the isotonic model with prior-balancing sample weights.

        When ``add_one`` is set (on the instance or in ``fit_params``), the
        points (X=1, y=0) and (X=0, y=1) are appended to prevent extreme LRs.
        """
        if fit_params.get('add_one') or self.add_one:
            X = np.append(X, [1, 0])
            y = np.append(y, [0, 1])

        prior = np.sum(y) / y.size
        # each class is weighted by the prior of the other class
        weight = y * (1 - prior) + (1 - y) * prior
        self._ir.fit(X, y, sample_weight=weight)
        return self

    def transform(self, X):
        """Store ``p1``/``p0`` for ``X`` on the instance and return ``to_odds(p1)``."""
        p1 = self._ir.transform(X)
        self.p1 = p1
        self.p0 = 1 - p1
        return to_odds(self.p1)
def test_isotonic_regression():
    """Check ``isotonic_regression`` and the ``IsotonicRegression`` estimator.

    Covers the functional API on a small input, agreement between
    ``fit().transform()``, ``fit_transform()`` and ``predict()``, and
    invariance under a permutation of the samples.
    """
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    y_ = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(y_, isotonic_regression(y))

    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # check that it is immune to permutation; a seeded generator keeps any
    # failure reproducible and leaves the global NumPy RNG state untouched
    rng = np.random.RandomState(42)
    perm = rng.permutation(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    assert_array_equal(ir.fit_transform(x[perm], y[perm]),
                       ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])
def predict_probs(model, train_class, train_features, test_features, normalize_probs=None):
    """
    Fit a given binary classification model to training sample features and
    return predicted probabilities for the positive class for the training
    and test samples.

    ``normalize_probs`` selects an optional post-processing of the
    probabilities: ``"ROCSlope"``, ``"LogShift"``, ``"IsoReg"`` or ``None``
    (no post-processing). Any other value terminates the program through
    ``sys.exit`` with an error message.

    Returns the tuple ``(train_prob, test_prob)``.
    """
    model.fit(train_features, train_class)
    train_prob, test_prob = [model.predict_proba(f)[:, 1]
                             for f in (train_features, test_features)]

    if normalize_probs == "ROCSlope":
        # calibrate probabilities based on the estimated local slope
        # of the ROC curve
        # NOTE(review): the three totals below are hard-coded; they look
        # specific to one dataset -- confirm before reusing on other data.
        chunk_size = 10  # number of instances for slope estimation
        n_train_pos = 301  # total number of positive (preictal) instances
        n_train_neg = 3766  # total negative (interictal)
        n_chunk_tot = 4000.0 / float(chunk_size)  # estimated total in test data

        # sort training data classes by predicted probability
        sort_order = train_prob.argsort()
        p_sorted = train_prob[sort_order]
        c_sorted = train_class[sort_order]
        ix = np.array(range(len(train_prob)))

        # loop over chunks; floor division because range() needs an int
        # (true division yields a float on Python 3 and raises TypeError)
        for i_ch in range(1 + (len(train_prob) - 1) // chunk_size):
            p_chunk, c_chunk = [
                x[np.where((ix >= i_ch * chunk_size) & (ix < (i_ch + 1) * chunk_size))[0]]
                for x in (p_sorted, c_sorted)
            ]
            pmin = np.min(p_chunk)
            pmax = np.max(p_chunk)

            # compute TPR/FPR (relative to the entire training set)
            tpr = np.sum(c_chunk) / float(n_train_pos)
            fpr = np.sum(1 - c_chunk) / float(n_train_neg)

            # compute probability transformation for this chunk
            qc = (2.0 / np.pi) * np.arctan(tpr / (fpr + 1.0e-3 / float(n_train_neg)))
            qmin = np.max((0.0, qc - 0.5 / float(n_chunk_tot)))
            qmax = np.min((1.0, qc + 0.5 / float(n_chunk_tot)))

            # transform probabilities
            # NOTE(review): train_prob/test_prob are rewritten in place, so a
            # value mapped by an earlier chunk can match a later chunk's
            # (pmin, pmax] range again -- confirm this is intended.
            tr_p_ch = np.where((train_prob > pmin) & (train_prob <= pmax))[0]
            train_prob[tr_p_ch] = qmin + (train_prob[tr_p_ch] - pmin) * (qmax - qmin) / (pmax - pmin)
            te_p_ch = np.where((test_prob > pmin) & (test_prob <= pmax))[0]
            test_prob[te_p_ch] = qmin + (test_prob[te_p_ch] - pmin) * (qmax - qmin) / (pmax - pmin)

    elif normalize_probs == "LogShift":
        # shift probabilities in log(p/(1-p)) so that a fraction f_pre
        # of the samples has probability > 0.5, where f_pre is the
        # fraction of preictal samples in the training data
        f_pre = len(np.where(train_class)[0]) / float(len(train_class))
        train_th, test_th = [sorted(p)[int((1.0 - f_pre) * len(p))]
                             for p in (train_prob, test_prob)]
        train_prob, test_prob = [
            (1.0 - pth) * p / (pth + p - 2.0 * pth * p)
            for (pth, p) in zip((train_th, test_th), (train_prob, test_prob))
        ]

    elif normalize_probs == "IsoReg":
        # fit an isotonic regression model to training probabilities
        # and use the model to transform all probabilities
        prob_model = IsotonicRegression(out_of_bounds="clip")
        prob_model.fit(train_prob, train_class)
        train_prob, test_prob = [prob_model.transform(p) for p in (train_prob, test_prob)]

    elif normalize_probs is not None:
        # sys.exit accepts a single argument; passing two raised TypeError
        # instead of exiting with the intended message
        sys.exit("Invalid value of normalize_probs: " + str(normalize_probs))

    return (train_prob, test_prob)
def test_assert_raises_exceptions():
    """Inconsistent sample counts and multi-column X must raise ValueError."""
    model = IsotonicRegression()
    rng = np.random.RandomState(42)

    # Mismatched lengths: first in sample_weight, then in y.
    length_msg = "Found input variables with inconsistent numbers of samples"
    for bad_args in (([0, 1, 2], [5, 7, 3], [0.1, 0.6]), ([0, 1, 2], [5, 7])):
        with pytest.raises(ValueError, match=length_msg):
            model.fit(*bad_args)

    # X with several columns is rejected by fit ...
    with pytest.raises(ValueError, match='X should be a 1d array'):
        model.fit(rng.randn(3, 10), [0, 1, 2])

    # ... and by transform.
    transform_msg = 'Isotonic regression input X should be a 1d array'
    with pytest.raises(ValueError, match=transform_msg):
        model.transform(rng.randn(3, 10))
class IsotonicCalibrator(BaseEstimator, TransformerMixin):
    """
    Calculates a likelihood ratio of a score value, provided it is from one of
    two distributions. Uses isotonic regression for interpolation.
    """

    def __init__(self, add_one=False, add_misleading=0):
        """
        Arguments:
            add_one: deprecated (same as add_misleading=1)
            add_misleading: int: add misleading data points on both sides (default: 0)
        """
        if add_one:
            warnings.warn(
                'parameter `add_one` is deprecated; use `add_misleading=1` instead'
            )
        self.add_misleading = int(bool(add_one)) + add_misleading
        self._ir = IsotonicRegression()

    def fit(self, X, y, **fit_params):
        """Fit the isotonic model, optionally padded with misleading points.

        ``fit_params['add_misleading']`` takes precedence over the deprecated
        ``fit_params['add_one']``, which takes precedence over the value given
        to the constructor.
        """
        # prevent extreme LRs
        if 'add_misleading' in fit_params:
            n_extra = fit_params['add_misleading']
        elif 'add_one' in fit_params:
            warnings.warn(
                'parameter `add_one` is deprecated; use `add_misleading=1` instead'
            )
            n_extra = 1 if fit_params['add_one'] else 0
        else:
            n_extra = self.add_misleading

        if n_extra > 0:
            # label-0 points just above the largest score and label-1 points
            # just below the smallest one
            above = np.ones(n_extra) * (X.max() + 1)
            below = np.ones(n_extra) * (X.min() - 1)
            X = np.concatenate([X, above, below])
            y = np.concatenate([y, np.zeros(n_extra), np.ones(n_extra)])

        prior = np.sum(y) / y.size
        # each class is weighted by the prior of the other class
        weight = y * (1 - prior) + (1 - y) * prior
        self._ir.fit(X, y, sample_weight=weight)
        return self

    def transform(self, X):
        """Store ``p1``/``p0`` for ``X`` on the instance and return ``to_odds(p1)``."""
        p1 = self._ir.transform(X)
        self.p1 = p1
        self.p0 = 1 - p1
        return to_odds(self.p1)
def bootstrap_calibrate_prob(labels, weights, probs, n_calibrations=30, threshold=0., symmetrize=False):
    """
    Bootstrap isotonic calibration (borrowed from tata-antares/tagging_LHCb):

    * randomly divide data into train-test
    * on train isotonic is fitted and applied to test
    * on test using calibrated probs p(B+) D2 and auc are calculated

    :param probs: probabilities, numpy.array of shape [n_samples]
    :param labels: numpy.array of shape [n_samples] with labels
    :param weights: numpy.array of shape [n_samples]
    :param n_calibrations: number of random train/test rounds
    :param threshold: float, to set labels 0/1
    :param symmetrize: bool, do symmetric calibration, ex. for B+, B-
    :return: D2 array and auc array
    """
    import numpy as np
    from sklearn.isotonic import IsotonicRegression
    from sklearn.metrics import roc_auc_score
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        # older scikit-learn releases only ship the legacy module
        from sklearn.cross_validation import train_test_split

    aucs = []
    D2_array = []
    labels = (labels > threshold) * 1

    for _ in range(n_calibrations):
        (train_probs, test_probs, train_labels, test_labels, train_weights,
         test_weights) = train_test_split(probs, labels, weights, train_size=0.5)
        iso_reg = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
        if symmetrize:
            # fit on every sample and on its mirror image (1 - p, flipped label)
            iso_reg.fit(np.r_[train_probs, 1 - train_probs],
                        np.r_[train_labels > 0, train_labels <= 0],
                        np.r_[train_weights, train_weights])
        else:
            iso_reg.fit(train_probs, train_labels, train_weights)

        probs_calib = iso_reg.transform(test_probs)
        alpha = (1 - 2 * probs_calib) ** 2
        aucs.append(
            roc_auc_score(test_labels, test_probs, sample_weight=test_weights))
        D2_array.append(np.average(alpha, weights=test_weights))

    return np.array(D2_array), np.array(aucs)
class IsotonicCalibration(BaseEstimator, TransformerMixin):
    """
    Builds an isotonic regression model on the observations:
    y_pred -> y_target
    """

    def __init__(self):
        self.calibration = IsotonicRegression(out_of_bounds="clip")

    def fit(self, y_pred: pd.Series, y_true: pd.Series):
        """Fit the isotonic mapping from ``y_pred`` to ``y_true``; returns ``self``."""
        calibrator = self.calibration
        calibrator.fit(y_pred, y_true)
        return self

    def transform(self, y_pred):
        """Return ``y_pred`` mapped through the fitted isotonic regression."""
        calibrated = self.calibration.transform(y_pred)
        return calibrated
def cal(self, mod_rootdir=None, model_dirpaths=None, example_dirname='clean_example_data', n_samples=100):
    """
    Implements calibration

    :param mod_rootdir: directory containing a bunch of model directories to be used for calibration.
        Either this or model_dirpaths should be set.
    :param model_dirpaths: list of model directories to be used for calibration.
        Either this or mod_rootdir should be set.
    :param example_dirname: name of the (clean) example data directory in each model directory
    :param n_samples: number of noisy samples of each data point
    :return: numpy array of calibrated probabilities, ordered like the model_dirpaths
        (or sorted directories in mod_rootdir)
    """
    has_rootdir = mod_rootdir is not None
    has_dirpaths = model_dirpaths is not None
    assert has_rootdir != has_dirpaths, "set either mod_rootdir or model_dirpaths"
    if not has_dirpaths:
        print("deprecation warning: using mod_rootdir is deprecated in favor of explicitly setting model_dirpaths")
        model_dirpaths = utils.get_modeldirs(mod_rootdir)

    # get the data for calibration
    mags = self.get_cal_data(model_dirpaths, example_dirname, n_samples=n_samples).reshape(-1)

    classes = []
    for pth in model_dirpaths:
        config_path = os.path.join(pth, 'config.json')
        classes.append(utils.get_class(config_path, classtype='binary', file=True))
    y = np.array(classes)
    if n_samples is not None:
        # repeat every model's label once per noisy sample
        y = y.reshape(-1, 1) * np.ones([1, n_samples])
        y = y.reshape(-1)

    # reuse a saved model unless there is none or overwriting was requested
    irpath = self.get_irpath()
    if not os.path.exists(irpath) or self.overwrite:
        # run the calibration & save model
        ir_model = IsotonicRegression(out_of_bounds='clip')
        lower = np.percentile(mags, 10)
        upper = np.percentile(mags, 90)
        ir_model.fit(np.clip(mags, lower, upper), y)
        joblib.dump(ir_model, irpath)
    else:
        ir_model = joblib.load(irpath)

    # get & return the calibrated probabilities
    return ir_model.transform(mags)
class LLRIsotonicRegression(LLR):
    """Log-likelihood ratio estimation by isotonic regression"""

    def __init__(self, equal_priors=False):
        super(LLRIsotonicRegression, self).__init__()
        self.equal_priors = equal_priors

    def fit(self, X, Y):
        """Fit an isotonic regression of ratios on scores; returns ``self``."""
        self.prior = self._get_prior(X, Y)
        scores, ratios = self._get_scores_ratios(X, Y)
        self.ir = IsotonicRegression(y_min=np.min(ratios), y_max=np.max(ratios))
        self.ir.fit(scores, ratios)
        return self

    def toLogLikelihoodRatio(self, scores):
        """Get log-likelihood ratio given scores

        Scores inside the fitted range go through the isotonic model; scores
        below / above that range get the model's ``y_min`` / ``y_max``.

        Parameters
        ----------
        scores : numpy array
            Test scores

        Returns
        -------
        llr : numpy array
            Log-likelihood ratio array with same shape as input `scores`
        """
        lowest = np.min(self.ir.X_)
        highest = np.max(self.ir.X_)

        below = scores < lowest
        above = scores > highest
        inside = (scores >= lowest) & (scores <= highest)

        calibrated = np.zeros(scores.shape)
        calibrated[inside] = self.ir.transform(scores[inside])
        calibrated[below] = self.ir.y_min
        calibrated[above] = self.ir.y_max
        return calibrated
def calibration_isotonic_regression(data_calibration, prob_model):
    """Calibrate ``prob_model`` with an isotonic regression fitted on ``data_calibration``.

    The regression maps the model probabilities computed on the calibration
    data to the probabilities returned by ``count_entries_per_interval``, and
    is then applied to ``prob_model``.
    """
    from sklearn.isotonic import IsotonicRegression as IR

    y_true, y_hat, sem_hat = predict_w_DNN(data_calibration)

    # only the second of the four returned values is used here
    _, prob_y_calibration, _, _ = count_entries_per_interval(y_true, y_hat, sem_hat)
    prob_model_y_calibration = predict_prob(y_true, y_hat, sem_hat)

    # isotonic regression
    regressor = IR(out_of_bounds='clip')
    regressor.fit(prob_model_y_calibration, prob_y_calibration)
    return regressor.transform(prob_model)
class IsotonicRecalibrator():
    """Isotonic recalibration of the scores of one class channel ``c``."""

    def __init__(self, c, device):
        self.c = c
        self.ir = IR(out_of_bounds='clip')
        self.device = device

    def fit(self, output, label):
        """Fit the regression on channel ``c`` of ``output`` against ``label == c``.

        ``output`` is indexed as ``output[:, c, :, :]``; both inputs are
        flattened before fitting.
        """
        # builtin float: the ``np.float`` alias no longer exists in NumPy
        x = output[:, self.c, :, :].reshape(-1).data.cpu().numpy().astype(float)
        y = (label == self.c).reshape(-1).data.cpu().numpy().astype(float)
        self.ir.fit(x, y)

    def predict(self, x):
        """Return the calibrated values of ``x`` as a float tensor of the same shape."""
        shape = x.shape
        x = x.reshape(-1).data.cpu().numpy().astype(float)
        return torch.tensor(self.ir.transform(x),
                            device=self.device,
                            dtype=torch.float).reshape(shape)
class IDXHack(object):
    """Replace OpenCV timestamps by times fitted from an .idx file.

    Without an .idx file the OpenCV time is returned unchanged.

    Usage
    =====
    >>> from mediaeval_util.repere import IDXHack
    >>> frame2time = IDXHack(args['--idx'])
    >>> trueTime = frame2time(opencvFrame, opencvTime)
    """

    def __init__(self, idx=None):
        """Load and fit the .idx table at path ``idx`` when one is given."""
        super(IDXHack, self).__init__()
        self.idx = idx

        if self.idx:
            # load .idx file using pandas; the separator is a raw string so
            # the regex reaches pandas unchanged instead of relying on an
            # invalid string escape
            df = read_table(
                self.idx, sep=r'\s+',
                names=['frame_number', 'frame_type', 'bytes', 'seconds']
            )
            # builtin float: the ``np.float`` alias no longer exists in NumPy
            x = np.array(df['frame_number'], dtype=float)
            y = np.array(df['seconds'], dtype=float)

            # train isotonic regression
            self.ir = IsotonicRegression(y_min=np.min(y), y_max=np.max(y))
            self.ir.fit(x, y)

            # frame number support
            self.xmin = np.min(x)
            self.xmax = np.max(x)

    def __call__(self, opencvFrame, opencvTime):
        """Return the fitted time for ``opencvFrame``, or ``opencvTime`` without .idx."""
        # Same truthiness test as __init__: an empty path means nothing was
        # fitted, so self.ir does not exist and must not be used.
        if not self.idx:
            return opencvTime

        clamped = min(self.xmax, max(self.xmin, opencvFrame))
        return self.ir.transform([clamped])[0]
def calibrate_probabilities(prob_dict, instance_label_dict):
    """Fit an isotonic calibration of ``prob_dict`` values against their labels.

    Labels are looked up in ``instance_label_dict`` by the keys of
    ``prob_dict``. The calibration curves of the calibrated and of the raw
    probabilities are saved to ``calibration_curve_on_data.png``.

    Returns the fitted regressor.
    """
    print(len(prob_dict))
    print(len(instance_label_dict))

    keys = list(prob_dict)
    labels = [instance_label_dict[key] for key in keys]
    probabilities = [prob_dict[key] for key in keys]

    # fit ir to abstract level precision and classes
    ir = IR(out_of_bounds='clip')
    ir.fit(probabilities, labels)
    p_calibrated = ir.transform(probabilities)

    # calibrated curve first, raw curve second, on the same axes
    fig, ax = plt.subplots()
    for scores in (p_calibrated, probabilities):
        fraction_of_positives, mean_predicted_value = calibration_curve(labels, scores, n_bins=10)
        ax.plot(mean_predicted_value, fraction_of_positives)
    plt.savefig('calibration_curve_on_data.png')
    return ir
def bootstrap_calibrate_prob(labels, weights, probs, n_calibrations=30, threshold=0., symmetrize=False):
    """
    Bootstrap isotonic calibration (borrowed from tata-antares/tagging_LHCb).

    Each of the ``n_calibrations`` rounds:

    * randomly splits the data in half
    * fits an isotonic regression on the train half
    * applies it to the test half and computes D2 and the AUC there

    :param labels: numpy.array of shape [n_samples] with labels
    :param weights: numpy.array of shape [n_samples]
    :param probs: probabilities, numpy.array of shape [n_samples]
    :param n_calibrations: number of random train/test rounds
    :param threshold: float, labels above it become 1, the others 0
    :param symmetrize: bool, do symmetric calibration, ex. for B+, B-
    :return: D2 array and auc array
    """
    labels = (labels > threshold) * 1
    d2_values = []
    auc_values = []

    for _ in range(n_calibrations):
        split = train_test_split(probs, labels, weights, train_size=0.5)
        train_probs, test_probs, train_labels, test_labels, train_weights, test_weights = split

        calibrator = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
        if not symmetrize:
            calibrator.fit(train_probs, train_labels, train_weights)
        else:
            # fit on every sample and on its mirror image (1 - p, flipped label)
            mirrored_probs = np.r_[train_probs, 1 - train_probs]
            mirrored_labels = np.r_[train_labels > 0, train_labels <= 0]
            mirrored_weights = np.r_[train_weights, train_weights]
            calibrator.fit(mirrored_probs, mirrored_labels, mirrored_weights)

        probs_calib = calibrator.transform(test_probs)
        alpha = (1 - 2 * probs_calib) ** 2
        auc_values.append(roc_auc_score(test_labels, test_probs, sample_weight=test_weights))
        d2_values.append(np.average(alpha, weights=test_weights))

    return np.array(d2_values), np.array(auc_values)
def test_isotonic_regression_with_ties_in_differently_sized_groups():
    """
    Non-regression test to handle issue 9432:
    https://github.com/scikit-learn/scikit-learn/issues/9432

    Expected values come from R:
    > library("isotone")
    > x <- c(0, 1, 1, 2, 3, 4)
    > y <- c(0, 0, 1, 0, 0, 1)
    > res1 <- gpava(x, y, ties="secondary")
    > res1$x

    `isotone` version: 1.1-0, 2015-07-24
    R version: R version 3.3.2 (2016-10-31)
    """
    inputs = np.array([0, 1, 1, 2, 3, 4])
    targets = np.array([0, 0, 1, 0, 0, 1])
    expected = np.array([0., 0.25, 0.25, 0.25, 0.25, 1.])

    model = IsotonicRegression()
    model.fit(inputs, targets)
    assert_array_almost_equal(model.transform(inputs), expected)
    assert_array_almost_equal(model.fit_transform(inputs, targets), expected)
def get_mapping(counts, lengths, bs=None, smoothed=True, verbose=False):
    """Return ``np.array([gdis, expected_counts])`` computed from ``counts``.

    ``gdis`` and the raw means come from ``_get_mapping_sparse`` or
    ``_get_mapping_dense``, depending on whether ``counts`` is sparse. With
    ``smoothed=False`` the raw means are returned. Otherwise a decreasing
    isotonic regression is fitted on the finite, non-zero means and its
    output replaces them; the leading entries skipped before fitting are put
    back as ``0`` (and ``means[0]`` when ``gdis`` has negative values).
    """
    if verbose:
        print("Computing relationship genomic distance & expected counts")

    mapper = _get_mapping_sparse if sparse.issparse(counts) else _get_mapping_dense
    gdis, means = mapper(counts, lengths, bs)

    if not smoothed:
        return np.array([gdis, means])

    if verbose:
        print("Fitting Isotonic Regression")

    from sklearn.isotonic import IsotonicRegression
    ir = IsotonicRegression(increasing=False, out_of_bounds="clip")

    # number of leading entries of ``means`` left out of the fit
    smallest = gdis.min()
    if smallest > 0:
        skip = 0
    elif smallest == 0:
        skip = 1
    else:
        skip = 2

    all_means = np.array(means)
    y = (all_means[skip:] if skip else all_means).flatten()
    x = np.arange(y.shape[0])

    # fit on finite, non-zero means only, then evaluate on every position
    usable = np.isfinite(y) & (y != 0)
    ir.fit(x[usable], y[usable])
    means_fitted = ir.transform(x)

    if smallest < 0:
        expected_counts = np.concatenate([[means[0], 0], means_fitted])
    elif smallest == 0:
        expected_counts = np.concatenate([[0], means_fitted])
    else:
        expected_counts = means_fitted

    return np.array([gdis, expected_counts])
class IsotonicCalibrator(BaseEstimator, TransformerMixin):
    """
    Calculates a likelihood ratio of a score value, provided it is from one of
    two distributions. Uses isotonic regression for interpolation.
    """

    def __init__(self, add_one=False):
        self.add_one = add_one
        self._ir = IsotonicRegression()

    def fit(self, X, y, **fit_params):
        """Fit the isotonic model with class-balancing sample weights.

        When ``add_one`` is set (on the instance or in ``fit_params``), the
        value 1 is appended to class 0 and the value 0 to class 1 to prevent
        extreme LRs.
        """
        scores0, scores1 = Xy_to_Xn(X, y)

        if fit_params.get('add_one') or self.add_one:
            scores0 = np.append(scores0, 1)
            scores1 = np.append(scores1, 0)

        n0 = scores0.shape[0]
        n1 = scores1.shape[0]
        X, y = Xn_to_Xy(scores0, scores1)

        # every sample is weighted by the size of the other class
        weight = np.concatenate([[n1] * n0, [n0] * n1])
        self._ir.fit(X, y, sample_weight=weight)
        return self

    def transform(self, X):
        """Store ``p0``/``p1`` for ``X`` on the instance and return ``p1 / p0``."""
        if isinstance(X, np.matrix):
            X = X.A1

        posterior = self._ir.transform(X)
        self.p0 = (1 - posterior)
        self.p1 = posterior

        # p0 == 0 yields an infinite ratio; silence the divide warning
        with np.errstate(divide='ignore'):
            return self.p1 / self.p0
# print("fold: " + str(j)) idx0 = xfolds[xfolds.fold5 != j + 1].index idx1 = xfolds[xfolds.fold5 == j + 1].index x0 = xtrain[xtrain.index.isin(idx0)] x1 = xtrain[xtrain.index.isin(idx1)] y0 = y[y.index.isin(idx0)] y1 = y[y.index.isin(idx1)] y_raw = np.array(x1)[:,wfold] storage_mat[j,0] = log_loss(y1, y_raw) ymat_valid[idx1,0] = y_raw # fit an isotonic regression for iso scaling ir = IR( out_of_bounds = 'clip' ) ir.fit( np.array(x0)[:,wfold], y0 ) y_iso = ir.transform((np.array(x1)[:,0])) storage_mat[j,1] = log_loss(y1, y_iso) ymat_valid[idx1,1] = y_iso storage_mat[j,7] = log_loss(y1, y_iso + y0.mean() - y_iso.mean()) ymat_valid[idx1,7] = y_iso + y0.mean() - y_iso.mean() # fit a logistic regression for Platt scaling lr = LR(C = c_val) lr.fit( np.array(x0)[:,0].reshape( -1, 1 ), y0 ) y_platt = lr.predict_proba(np.array(x1)[:,0].reshape(-1,1))[:,1] storage_mat[j,2] = log_loss(y1, y_platt) ymat_valid[idx1,2] = y_platt storage_mat[j,8] = log_loss(y1, y_platt + y0.mean() - y_platt.mean()) ymat_valid[idx1,8] = y_platt + y0.mean() - y_platt.mean() y_ri = 0.5 * (y_raw + y_iso)
# Calibrate predicted click probabilities with an isotonic regression fitted
# on a clickTime window of the training data, then write the result files.
p_train_all = read_csv(trainResult)['prob']
oriTrain = read_csv('../data/train.csv')
sameTrain = oriTrain[oriTrain['clickTime'] >= 190000].reset_index()
# print() call form: the Python 2 print statement is a syntax error on Python 3
print(len(sameTrain), len(p_train_all))

# rows of the reset-index frame inside the calibration window; their index
# selects the matching predictions
part_sameTrain = sameTrain[(sameTrain['clickTime'] >= 200000)
                           & (sameTrain['clickTime'] < 290000)]
p_train = p_train_all.loc[part_sameTrain.index]
y_train = part_sameTrain['label']

ir = IR()
ir.fit(p_train, y_train)

oriResult = read_csv(
    '../data/calibration/ffm_mergeAppUser_s17_preAction_190000_no_Dist_noNum_t150_k8_l2e-05_2017-06-05-20-58-00.csv'
)
p_test = oriResult['prob']
# apply the mapping learned on the training window; do not refit on p_test
p_calibrated = ir.transform(p_test)
oriResult['new_prob'] = Series(p_calibrated)
oriResult.to_csv('../data/calibration/calib_temp.csv', index=False)

# keep the original probability wherever the calibrated one is not positive
oriResult['nozero_new_prob'] = oriResult.apply(
    lambda x: x['new_prob'] if x['new_prob'] > 0 else x['prob'],
    axis='columns')
del oriResult['prob'], oriResult['new_prob']
oriResult.rename(columns={'nozero_new_prob': 'prob'}, inplace=True)
oriResult.to_csv('../data/calibration/calib_submit.csv', index=False)
class InterpolatedIsotonicRegression(BaseEstimator, TransformerMixin,
                                     RegressorMixin):
    """Interpolated Isotonic Regression model.

    Fits an isotonic regression, then applies linear interpolation to turn
    the piecewise constant fit into a piecewise linear model.

    Parameters
    ----------
    y_min, y_max, increasing, out_of_bounds
        Stored as given and forwarded to the inner ``IsotonicRegression``
        when ``fit`` is called.

    Notes
    -----
    ``transform`` evaluates the two interpolators only. Inputs below the
    smallest training ``X`` map to 0.0 and inputs above the largest map to
    1.0 (the ``fill_value`` pair given to ``interp1d``).
    """

    def __init__(self, y_min=None, y_max=None, increasing=True,
                 out_of_bounds='nan'):
        self.y_min = y_min
        self.y_max = y_max
        self.increasing = increasing
        self.out_of_bounds = out_of_bounds

    def fit(self, X, y, sample_weight=None):
        """Fit the model using X, y as training data.

        Parameters
        ----------
        X : array-like, shape=(n_samples,)
            Training data. Does not need to be sorted.

        y : array-like, shape=(n_samples,)
            Training target.

        sample_weight : array-like, shape=(n_samples,), optional, default: None
            Weights. If set to None, all weights will be set to 1 (equal
            weights).

        Returns
        -------
        self : object
            Returns an instance of self.

        Notes
        -----
        The training points at which the isotonic fit changes level are kept
        inside the two interpolators used by `transform`.
        """
        # Boolean-mask indexing below needs an ndarray, so accept any
        # array-like (lists included).
        X = np.asarray(X).reshape(-1)

        self.iso_ = IsotonicRegression(y_min=self.y_min,
                                       y_max=self.y_max,
                                       increasing=self.increasing,
                                       out_of_bounds=self.out_of_bounds)
        self.iso_.fit(X, y, sample_weight=sample_weight)

        # Level changes are detected between neighbours, so the fitted values
        # must be looked at in increasing-X order. The stable sort leaves
        # already sorted input untouched.
        order = np.argsort(X, kind='mergesort')
        X_sorted = X[order]
        p = self.iso_.transform(X_sorted)

        # `!= 0` (rather than `> 0`) also catches the steps of a decreasing
        # fit; for an increasing fit both tests select the same points.
        changed = np.diff(p) != 0
        # First point of every constant level, plus both end points.
        level_starts = np.concatenate(([True], changed))
        level_starts[-1] = True
        # Last point of every constant level, plus both end points.
        level_ends = np.concatenate((changed, [True]))
        level_ends[0] = True

        self.iso_interp1_ = interp1d(X_sorted[level_starts],
                                     p[level_starts],
                                     bounds_error=False,
                                     fill_value=(0., 1.))
        self.iso_interp2_ = interp1d(X_sorted[level_ends],
                                     p[level_ends],
                                     bounds_error=False,
                                     fill_value=(0., 1.))

        return self

    def transform(self, T):
        """Transform new data by linear interpolation

        Parameters
        ----------
        T : array-like, shape=(n_samples,)
            Data to transform.

        Returns
        -------
        T_ : array, shape=(n_samples,)
            The transformed data: the average of the interpolation through
            the level starts and the interpolation through the level ends.
        """
        return 0.5 * (self.iso_interp1_(T) + self.iso_interp2_(T))

    def predict(self, T):
        """Predict new data by linear interpolation.

        Parameters
        ----------
        T : array-like, shape=(n_samples,)
            Data to transform.

        Returns
        -------
        T_ : array, shape=(n_samples,)
            Transformed data.
        """
        return self.transform(T)
# train/test split (in half)
# Floor division keeps the slice bound an int on Python 2 and Python 3.
train_end = y.shape[0] // 2
# The test half starts right where the training half ends, so no sample is
# left out of both halves.
test_start = train_end

y_train = y[0:train_end]
y_test = y[test_start:]

p_train = p[0:train_end]
p_test = p[test_start:]

###

ir = IR(out_of_bounds="clip")  # out_of_bounds param needs scikit-learn >= 0.15
ir.fit(p_train, y_train)
p_calibrated = ir.transform(p_test)

# Safety net: replace any NaN left in the calibrated output by 0.
p_calibrated[np.isnan(p_calibrated)] = 0

###

acc = accuracy_score(y_test, np.round(p_test))
acc_calibrated = accuracy_score(y_test, np.round(p_calibrated))

auc = AUC(y_test, p_test)
auc_calibrated = AUC(y_test, p_calibrated)

ll = log_loss(y_test, p_test)
ll_calibrated = log_loss(y_test, p_calibrated)

# Explicit formatting prints the same text on Python 2 and Python 3.
print("accuracy - before/after: %s / %s" % (acc, acc_calibrated))
class IsotonicCalibrator(BaseEstimator, RegressorMixin):
    """Probability calibration with isotonic regression.

    Note
    ----
    This class backports and extends `sklearn.isotonic.IsotonicRegression`.
    """

    def __init__(self, y_min=None, y_max=None, increasing=True,
                 interpolation=False):
        """Constructor.

        Parameters
        ----------
        * `y_min` [optional]:
            If not `None`, set the lowest value of the fit to `y_min`.

        * `y_max` [optional]:
            If not `None`, set the highest value of the fit to `y_max`.

        * `increasing` [boolean or string, default=`True`]:
            If boolean, whether or not to fit the isotonic regression with `y`
            increasing or decreasing.
            The string value `"auto"` determines whether `y` should increase or
            decrease based on the Spearman correlation estimate's sign.

        * `interpolation` [boolean, default=`False`]:
            Whether linear interpolation is enabled or not.
        """
        self.y_min = y_min
        self.y_max = y_max
        self.increasing = increasing
        self.interpolation = interpolation

    def fit(self, T, y, sample_weight=None):
        """Fit using `T`, `y` as training data.

        Parameters
        ----------
        * `T` [array-like, shape=(n_samples,)]:
            Training data. Does not need to be sorted.

        * `y` [array-like, shape=(n_samples,)]:
            Training target.

        * `sample_weight` [array-like, shape=(n_samples,), optional]:
            Weights. If set to None, all weights will be set to 1.

        Returns
        -------
        * `self` [object]:
            `self`.

        Notes
        -----
        When interpolation is enabled, the training points at which the fit
        changes level are kept inside the interpolators used by `predict`.
        """
        # Check input
        T = column_or_1d(T)

        # Fit isotonic regression
        self.ir_ = IsotonicRegression(y_min=self.y_min,
                                      y_max=self.y_max,
                                      increasing=self.increasing,
                                      out_of_bounds="clip")
        self.ir_.fit(T, y, sample_weight=sample_weight)

        # Interpolators
        if self.interpolation:
            # Level changes are detected between neighbours, so the fitted
            # values must be looked at in increasing-T order. The stable sort
            # leaves already sorted input untouched.
            order = np.argsort(T, kind="mergesort")
            T_sorted = T[order]
            p = self.ir_.transform(T_sorted)

            # `!= 0` (rather than `> 0`) also catches the steps of a
            # decreasing fit; for an increasing fit both tests agree.
            changed = np.diff(p) != 0
            # First point of every constant level, plus both end points.
            level_starts = np.concatenate(([True], changed))
            level_starts[-1] = True
            # Last point of every constant level, plus both end points.
            level_ends = np.concatenate((changed, [True]))
            level_ends[0] = True

            # Outside the training range the interpolators return 0.0 (below)
            # and 1.0 (above), whatever `y_min` / `y_max` are.
            self.interp1_ = interp1d(T_sorted[level_starts],
                                     p[level_starts],
                                     bounds_error=False,
                                     fill_value=(0., 1.))
            self.interp2_ = interp1d(T_sorted[level_ends],
                                     p[level_ends],
                                     bounds_error=False,
                                     fill_value=(0., 1.))

        return self

    def predict(self, T):
        """Calibrate data.

        Parameters
        ----------
        * `T` [array-like, shape=(n_samples,)]:
            Data to calibrate.

        Returns
        -------
        * `Tt` [array, shape=(n_samples,)]:
            Calibrated data. With interpolation this is the average of the
            two interpolators; otherwise the isotonic regression output.
        """
        if self.interpolation:
            T = column_or_1d(T)
            return 0.5 * (self.interp1_(T) + self.interp2_(T))

        else:
            return self.ir_.transform(T)
class CalibratedRegression:
    '''Quantile recalibration of a regression model with isotonic regression.

    The data are split into a training and a calibration part. On the
    calibration part the predicted CDF and its empirical CDF are computed
    from posterior predictive samples, and an isotonic regression is fitted
    from the empirical CDF to the predicted CDF. That map is then used to
    translate requested quantiles into the quantiles to read off the
    posterior predictive.
    '''

    def __init__(self,
                 X,
                 y,
                 model,
                 cal_prop=0.2,
                 cdf_method='bayesian',
                 pp=None,
                 pp_params=None):
        '''Initializes the class

        Parameters
        ----------
        X : np.array
            Data X
        y : np.array
            Data y
        model : pymc3 model or sklearn or statsmodels model
            The model to be calibrated
        cal_prop : float, optional, default: 0.2
            The proportion of the training set to be used to make the
            calibration set
        cdf_method : string, optional, default: 'bayesian'
            Whether it is a Bayesian model, statsmodel or sklearn model
            Must be 'bayesian', 'bootstrap' or 'statsmodels'
        pp : function, optional, default: None
            The function to calculate the posterior predictive.
            Must return a numpy array.
        pp_params : dict, default: None
            Any additional parameters to be passed into the posterior
            predictive function.

        Raises
        ------
        ValueError
            If `cdf_method` is not one of the three supported names.
        '''
        # data
        self.X = X
        self.y = y

        # model
        self.model = model
        self.posterior_predictive = pp
        self.pp_params = pp_params

        # calibration features; all stay None until fit() has run, which is
        # what the 'Call fit() first' assertions rely on
        self.calibration_dataset = None
        self.isotonic = None
        self.predicted_cdf = None
        self.empirical_cdf = None

        # split up training and calibration sets
        self.X_train, self.X_cal, self.y_train, self.y_cal = train_test_split(
            X, y, test_size=cal_prop)

        if cdf_method in ['bayesian', 'bootstrap', 'statsmodels']:
            self.cdf_method = cdf_method
        else:
            raise ValueError(
                "cdf_method must be of type 'bayesian', 'bootstrap', or 'statsmodels'"
            )

    @staticmethod
    def bootstrap():
        '''Utility function to bootstrap (placeholder, does nothing yet).'''
        pass

    def fit(self):
        '''Fit underlying model

        Creates the calibration dataset and fits an IsotonicRegression on
        this dataset

        Returns
        -------
        self : CalibratedRegression object
            Returns a fit instance of the CalibratedRegression class
        '''
        if self.cdf_method == 'bayesian':
            # there should be a posterior_predictive function
            assert self.posterior_predictive is not None and self.pp_params is not None
            # call the posterior predictive function
            self.posterior_predictive_cal = self.posterior_predictive(
                self.X_cal, **self.pp_params)
        elif self.cdf_method == 'bootstrap':
            # get CDF from bootstrapping (not implemented)
            pass
        elif self.cdf_method == 'statsmodels':
            # get CDF from statsmodels (not implemented)
            pass

        # create the calibration dataset
        # NOTE: this calls the posterior predictive function again on X_cal.
        (self.calibration_dataset, self.predicted_cdf,
         self.empirical_cdf) = self.create_calibration_dataset()

        # fit the isotonic regression from empirical CDF to predicted CDF
        self.isotonic = IsotonicRegression(out_of_bounds='clip')
        self.isotonic.fit(self.empirical_cdf, self.predicted_cdf)

        return self

    def create_calibration_dataset(self,
                                   X=None,
                                   y=None,
                                   pp=None,
                                   pp_params=None):
        '''Creates a Pandas dataframe which has the calibration dataset

        Parameters
        ----------
        X : np.array, optional, default: None
            Data X. Uses self.X_cal if None
        y : np.array, optional, default: None
            Data y. Uses self.y_cal if None
        pp : function, optional, default: None
            The function to calculate the posterior predictive.
            Must return a numpy array.
            Uses self.posterior_predictive if None
        pp_params : dict, default: None
            Any additional parameters to be passed into the posterior
            predictive function. Uses self.pp_params if None

        Returns
        -------
        calibration_dataset : Pandas dataframe
            This contains X, y, predicted_cdf and empirical_cdf
        predicted_cdf : np.array
            The predicted CDF of every calibration point
        empirical_cdf : np.array
            The empirical CDF of the predicted CDF values
        '''
        # check conditions
        X = X if X is not None else self.X_cal
        y = y if y is not None else self.y_cal
        pp = pp if pp is not None else self.posterior_predictive
        pp_params = pp_params if pp_params is not None else self.pp_params

        post_pred = pp(X, **pp_params)
        predicted_cdf = self.pcdf(post_pred, y)  # predicted CDF
        empirical_cdf = self.ecdf(predicted_cdf)  # empirical CDF

        # putting results in a Pandas dataframe
        calibration_dataset = pd.DataFrame({
            'X': X,
            'y': y,
            'predicted_cdf': predicted_cdf,
            'empirical_cdf': empirical_cdf
        })

        # explicit column selection fixes the column order
        return calibration_dataset[[
            'X', 'y', 'predicted_cdf', 'empirical_cdf'
        ]], predicted_cdf, empirical_cdf

    def predict(self, X_test, y_pred, quantiles):
        '''Return point estimates and PIs.

        Parameters
        ----------
        X_test : np.array
            Test data
        y_pred : np.array
            The predictions made by the model
        quantiles : list
            List of floats between 0 and 1 to be calibrated.
            Example: [0.05, 0.5, 0.95]

        Returns
        -------
        posterior_predictive_test : np.array
            Posterior predictive samples for the test data
        new_quantiles : list
            List of new floats, also between 0 and 1, that are the
            calibrated version of the input quantiles
        '''
        assert self.isotonic is not None, 'Call fit() first'
        new_quantiles = self.predict_quantiles(quantiles)

        # saving variables
        self.X_test = X_test
        self.y_pred = y_pred
        self.posterior_predictive_test = self.posterior_predictive(
            X_test, **self.pp_params)

        # returning quantiles
        return self.posterior_predictive_test, new_quantiles

    def predict_quantiles(self, quantiles):
        '''Returns transformed quantiles according to the isotonic
        regression model

        Parameters
        ----------
        quantiles : list
            List of floats between 0 and 1 to be calibrated.
            Example: [0.05, 0.5, 0.95]

        Returns
        -------
        quantiles_ : list
            List of new floats, also between 0 and 1, that are the
            calibrated version of the input quantiles
        '''
        assert self.isotonic is not None, 'Call fit() first'
        return self.isotonic.transform(quantiles)

    def pcdf(self, post_pred, y):
        '''Gets Predicted CDF

        Gets the predicted cdf, also represented as H(x_t)(y_t) in the paper.

        Parameters
        ----------
        post_pred : np.array
            Posterior predictive samples generated by the model, one row per
            data point.
        y : np.array
            The true data

        Returns
        -------
        pcdf_ : np.array
            The predicted cdf: for every row, the fraction of samples that
            are <= the matching entry of y
        '''
        return np.mean(post_pred <= y.reshape(-1, 1), axis=1)

    def ecdf(self, predicted_cdf):
        r'''Empirical CDF.

        Gets the empirical cdf, also represented as $\hat{P}[H(x_t)(y_t)]$
        in the paper. Counts how many points in the dataset have a
        pcdf <= to the pcdf of a point for all points in the dataset.

        Parameters
        ----------
        predicted_cdf : np.array
            Predicted cdf. Can be generated by calling self.pcdf for
            posterior predictive samples. Assumed to contain no NaN.

        Returns
        -------
        ecdf_ : np.array
            The empirical cdf
        '''
        values = np.asarray(predicted_cdf)
        n_points = len(values)
        if n_points == 0:
            return np.zeros(0)
        # In the sorted values, the right-hand insertion position of v is the
        # number of entries <= v; this replaces the pairwise comparison loop.
        counts = np.searchsorted(np.sort(values), values, side='right')
        return counts / n_points

    def plot_calibration_curve(self, ax):
        '''Plot calibration curve as described in paper (figure 3b).

        Parameters
        ----------
        ax : matplotlib axis object
            Axis to plot on

        Returns
        -------
        ax : matplotlib axis object
            Axis after it has been plotted on
        '''
        assert self.empirical_cdf is not None, 'Call fit() first'
        ax.scatter(self.predicted_cdf, self.empirical_cdf, alpha=0.7)
        ax.plot([0, 1], [0, 1],
                '--',
                color='grey',
                label='Perfect calibration')
        ax.set_xlabel('Predicted', fontsize=17)
        ax.set_ylabel('Empirical', fontsize=17)
        ax.set_title('Predicted CDF vs Empirical CDF', fontsize=17)
        ax.legend(fontsize=17)
        return ax

    def plot_diagnostic_curve(self, ax, X_test, y_test):
        '''Plot diagnostic curve as described in paper (figure 3c).

        Parameters
        ----------
        ax : matplotlib axis object
            Axis to plot on
        X_test : np.array
            Test data (X)
        y_test : np.array
            Test data (y). These are the predictions that need to be
            calibrated.

        Returns
        -------
        ax : matplotlib axis object
            Axis after it has been plotted on
        '''
        # lower tail probabilities of symmetric intervals, 2.5% to 47.5%
        conf_level_lower_bounds = np.arange(start=0.025, stop=0.5, step=0.025)
        conf_levels = 1 - 2 * conf_level_lower_bounds
        unc_pcts = []
        cal_pcts = []

        for cl_lower in conf_level_lower_bounds:
            quants = [cl_lower, 1 - cl_lower]
            post_pred_test, new_quantiles = self.predict(
                X_test, y_test, quants)

            cal_lower, cal_upper = np.quantile(post_pred_test,
                                               new_quantiles,
                                               axis=1)
            unc_lower, unc_upper = np.quantile(post_pred_test, quants, axis=1)

            # observed coverage of the uncalibrated / calibrated interval
            perc_within_unc = np.mean((y_test <= unc_upper)
                                      & (y_test >= unc_lower))
            perc_within_cal = np.mean((y_test <= cal_upper)
                                      & (y_test >= cal_lower))

            unc_pcts.append(perc_within_unc)
            cal_pcts.append(perc_within_cal)

        ax.plot([0, 1], [0, 1], '--', color='grey')
        ax.plot(conf_levels,
                unc_pcts,
                '-o',
                color='purple',
                label='uncalibrated')
        ax.plot(conf_levels, cal_pcts, '-o', color='red', label='calibrated')
        ax.legend(fontsize=14)
        ax.set_title('Diagnostic Plot', fontsize=17)
        ax.set_xlabel('Predicted Confidence Level', fontsize=17)
        ax.set_ylabel('Observed Confidence Level', fontsize=17)
        return ax

    def plot_intervals(self, ax, X_test, y_test,
                       quantiles=(0.05, 0.5, 0.95)):
        '''Plot uncalibrated and calibrated predictive intervals.

        Parameters
        ----------
        ax : sequence of two matplotlib axis objects
            Axes to plot on: ax[0] gets the uncalibrated and ax[1] the
            calibrated intervals
        X_test : np.array
            Test data (X)
        y_test : np.array
            Test data (y). These are the predictions that need to be
            calibrated.
        quantiles : sequence of three floats, optional,
            default=(0.05, 0.5, 0.95)
            Lower, median and upper quantile between 0 and 1 to be
            calibrated.

        Returns
        -------
        ax : the axes after they have been plotted on
        calibrated : tuple
            (cal_lower, cal_median, cal_upper)
        uncalibrated : tuple
            (unc_lower, unc_median, unc_upper)
        '''
        assert len(ax) == 2, 'Need to provide two axes'
        # work on a private list so the caller's sequence is never shared
        quantiles = list(quantiles)
        post_pred_test, new_quantiles = self.predict(X_test, y_test,
                                                     quantiles)

        cal_lower, cal_median, cal_upper = np.quantile(post_pred_test,
                                                       new_quantiles,
                                                       axis=1)
        unc_lower, unc_median, unc_upper = np.quantile(post_pred_test,
                                                       quantiles,
                                                       axis=1)

        perc_within_unc = np.mean((y_test <= unc_upper)
                                  & (y_test >= unc_lower))
        perc_within_cal = np.mean((y_test <= cal_upper)
                                  & (y_test >= cal_lower))

        ax[0].plot(X_test, y_test, 'o', color='black', alpha=0.2, markersize=3)
        ax[0].set_title(
            f'Uncalibrated: {100*perc_within_unc:.2f}% of the test points within {round((1-2*quantiles[0])*100)}% interval',
            fontsize=17)
        ax[0].set_xlabel('X', fontsize=17)
        ax[0].set_ylabel('y', fontsize=17)
        ax[0].fill_between(X_test,
                           unc_lower,
                           unc_upper,
                           color='green',
                           alpha=0.2)
        ax[0].plot(
            X_test,
            unc_median,
            label=f'Median. MSE={mean_squared_error(y_test, unc_median):.2f}')
        ax[0].legend(fontsize=17)

        ax[1].plot(X_test, y_test, 'o', color='black', alpha=0.2, markersize=3)
        ax[1].set_title(
            f'Calibrated: {100*perc_within_cal:.2f}% of the test points within {round((1-2*quantiles[0])*100)}% interval',
            fontsize=17)
        ax[1].set_xlabel('X', fontsize=17)
        ax[1].set_ylabel('y', fontsize=17)
        ax[1].fill_between(X_test,
                           cal_lower,
                           cal_upper,
                           color='yellow',
                           alpha=0.2)
        ax[1].plot(
            X_test,
            cal_median,
            label=f'Median. MSE={mean_squared_error(y_test, cal_median):.2f}')
        ax[1].legend(fontsize=17)

        return ax, (cal_lower, cal_median, cal_upper), (unc_lower, unc_median,
                                                        unc_upper)
def _collect_scores_and_labels(tree, instances, score_of):
    """Return (scores, labels) as (n, 1) arrays with one row per instance.

    `score_of` turns the tree's distribution for an instance into the scalar
    score stored for it; the label is the instance's class value.
    """
    scores = np.zeros(shape=(instances.num_instances, 1))
    labels = np.zeros(shape=(instances.num_instances, 1))
    for i, instance in enumerate(instances):
        scores[i] = [score_of(tree.distribution_for_instance(instance))]
        labels[i] = [instance.get_value(instance.class_index)]
    return scores, labels


def calculate_probability_distribution(tree, instances, index, cal_method=None):
    """Return the (optionally calibrated) distribution of one instance.

    Parameters
    ----------
    tree : object with `distribution_for_instance(instance)`
        Classifier whose output is calibrated.
    instances : iterable with `num_instances` and `get_instance(i)`
        Instances used both as calibration set and to look up `index`.
    index : int
        Position of the instance to score inside `instances`.
    cal_method : None, 'Platt', 'Isotonic', 'ICP' or 'Venn1'
        Calibration method. None returns the tree's own distribution.

    Returns
    -------
    * None / single-class calibration set: the tree's own distribution.
    * 'Platt': `predict_proba` row of a logistic regression fitted on the
      class-1 scores rescaled from [0, 1] to [-1, 1].
    * 'Isotonic': `[1 - p, p]` where `p` is the isotonic-calibrated class-1
      score, i.e. the same class order as the other branches.
    * 'Venn1': `[p0, p1]` exactly as returned by
      `VennABERS.ScoresToMultiProbs`.
    * 'ICP' (not implemented) and any other value: None.
    """
    if cal_method is None:
        return tree.distribution_for_instance(instances.get_instance(index))

    if cal_method in ('Platt', 'Isotonic'):
        use_platt = cal_method == 'Platt'
        if use_platt:
            # Platt works on the class-1 score mapped from [0, 1] to [-1, 1].
            score_of = lambda dist: (dist[1] - 0.5) * 2.0
        else:
            score_of = lambda dist: dist[1]

        p_train, y_train = _collect_scores_and_labels(tree, instances, score_of)
        target_score = score_of(
            tree.distribution_for_instance(instances.get_instance(index)))

        print(np.sum(y_train))
        # Labels summing to 0 or to n mean a single class is present, so no
        # calibrator can be fitted: fall back to the raw distribution.
        if np.sum(y_train) in [len(y_train), 0]:
            print("all one class")
            for ins in instances:
                print("ins ===> " , ins)
            return tree.distribution_for_instance(instances.get_instance(index))

        if use_platt:
            # NOTE: this changes the process-wide warning filters.
            warnings.filterwarnings("ignore", category=FutureWarning)
            lr = LR(solver='lbfgs')
            lr.fit(p_train, np.ravel(y_train, order='C'))
            return lr.predict_proba(np.array([[target_score]], dtype=float))[0]

        ir = IR(out_of_bounds='clip')
        ir.fit(np.ravel(p_train, order='C'), np.ravel(y_train, order='C'))
        p = ir.transform(np.array([target_score], dtype=float))[0]
        # `p` is fitted on the class-1 score against the class value, so it
        # belongs in slot 1.
        return [1 - p, p]

    # elif cal_method == 'ProbabilityCalibrationTree' :
    #     pass
    if cal_method == 'ICP':
        # Not implemented; falls through to the implicit None below.
        pass
    elif cal_method == 'Venn1':
        calibrPts = []
        for instance in instances:
            dist = tree.distribution_for_instance(instance)
            # Score is the larger of the two class probabilities.
            score = dist[0] if dist[1] < dist[0] else dist[1]
            calibrPts.append((score, instance.get_value(instance.class_index)))

        dist = tree.distribution_for_instance(instances.get_instance(index))
        score = dist[0] if dist[1] < dist[0] else dist[1]
        tmp = [score]

        p0, p1 = VennABERS.ScoresToMultiProbs(calibrPts, tmp)
        print("Vennnnnn =========>>>>>>>>>>>> ", p0, " , ",p1)
        return [p0, p1]
# X = np.random.rand(N) # X = np.sort(X) # rs = check_random_state(312312) # Y = rs.randint(-10, 10, size=(N,)) + 10. * np.log1p(np.arange(N)) L = 20 / 0.4 # L = 3 * 100 plt.plot(X, Y, 'o', label="Y") idx_vector = np.arange(N) ir = IsotonicRegression() ir = ir.fit(X, Y) Y_iso = ir.transform(X) plt.plot(X, Y_iso, '-d', label="iso(Y)") plt.legend() T = np.linspace(0.001, 0.999, 50) f = ir.predict(T) f[T < X[0]] = Y_iso[0] f[T > X[-1]] = Y_iso[-1] delta = 0.1 # for idx in range(len(T)): # X_new = T[idx] # if X_new < X[0]: # lb = -L * np.abs(X_new - X[0]) - np.sqrt(2*np.log((N**2 + N)/delta)) # lbm = 1
class Forecaster(nn.Module):
    """Base class for forecasters that predict a Gaussian mean and stddev.

    `forward(bx=..., br=...)` is not defined here; `eval_all` expects it to
    return a `(mean, stddev)` pair. The class adds the training losses and an
    optional isotonic recalibration of the predicted CDF values.
    """

    def __init__(self, args):
        """Store `args`; `args.klcoeff` is read by `eval_all`."""
        super(Forecaster, self).__init__()
        self.args = args
        # Set by recalibrate(). Until then apply_recalibrate() returns its
        # input unchanged.
        self.iso_transform = None

    def eval_all(self, bx, by):
        """Evaluate the forecaster on a batch.

        Returns
        -------
        cdf : Gaussian CDF of `by` under the predicted mean / stddev
        loss : `(1 - klcoeff) * mean|cdf - br| + klcoeff * KL term`, where
            `br` is drawn uniformly at random and also fed to `forward`
        loss_stddev : mean predicted stddev
        loss_nll : mean Gaussian negative log likelihood of `by`
        """
        br = torch.rand(bx.shape[0], 1, device=bx.device)
        mean, stddev = self.forward(bx=bx, br=br)
        cdf = 0.5 * (1.0 + torch.erf((by - mean) / stddev / math.sqrt(2)))

        loss_cdf = torch.abs(cdf - br).mean()

        # eps keeps the logarithms finite when cdf or br reach 0 or 1
        eps = 1e-5
        loss_cdf_kl = cdf * (torch.log(cdf + eps) - torch.log(br + eps)) + \
                      (1 - cdf) * (torch.log(1 - cdf + eps) - torch.log(1 - br + eps))
        loss_cdf_kl = loss_cdf_kl.mean()

        loss_stddev = stddev.mean()

        # loss_l2 = ((by - mean) ** 2).mean()

        # Log likelihood of by under the predicted Gaussian distribution
        loss_nll = torch.log(stddev) + math.log(2 * math.pi) / 2.0 + ((
            (by - mean) / stddev)**2 / 2.0)
        loss_nll = loss_nll.mean()

        return cdf, loss_cdf * (
            1 - self.args.klcoeff
        ) + loss_cdf_kl * self.args.klcoeff, loss_stddev, loss_nll

    def eval_in_batch(self, bx, by, batch_size):
        """Placeholder; not implemented."""
        pass

    def recalibrate(self, bx, by):
        """Fit the isotonic map from predicted CDF values to uniform levels.

        The sorted CDF values of `by` are paired with an evenly spaced grid
        on [0, 1] and an isotonic regression is fitted on those pairs.
        """
        with torch.no_grad():
            # np.float64 replaces the `np.float` alias removed from NumPy.
            cdf = self.eval_all(bx, by)[0].cpu().numpy()[:, 0].astype(np.float64)

        cdf = np.sort(cdf)
        lin = np.linspace(0, 1, int(cdf.shape[0]))

        # Insert an extra 0 and 1 to ensure the range is always [0, 1], and
        # trim CDF for numerical stability
        cdf = np.clip(cdf, a_max=1.0 - 1e-6, a_min=1e-6)
        cdf = np.concatenate(([0.0], cdf, [1.0]))
        lin = np.concatenate(([0.0], lin, [1.0]))

        self.iso_transform = IsotonicRegression()
        self.iso_transform.fit(cdf, lin)

    def apply_recalibrate(self, cdf):
        """Map CDF values through the fitted isotonic transform.

        A tensor input gives a tensor on the same device, a numpy input gives
        a numpy array; the shape is preserved. Without a fitted transform the
        input is returned unchanged.
        """
        if self.iso_transform is None:
            return cdf

        is_torch = isinstance(cdf, torch.Tensor)
        if is_torch:
            # `.device` is valid for CPU tensors too (get_device() is -1
            # there, which `.to()` rejects).
            device = cdf.device
            cdf = cdf.detach().cpu().numpy()

        original_shape = cdf.shape
        new_cdf = np.reshape(self.iso_transform.transform(cdf.flatten()),
                             original_shape)
        if is_torch:
            new_cdf = torch.from_numpy(new_cdf).to(device)
        return new_cdf
class QuantileCalibration:
    """Isotonic quantile calibration after Kuleshov et al. (2018),
    https://arxiv.org/abs/1807.00263

    Two isotonic regressions are fitted on a recalibration dataset built from
    posterior predictive samples and observations: one from predicted to
    empirical quantiles and one in the opposite direction.
    """

    def __init__(self):
        # Both regressors stay None until fit() has been called.
        self.isotonic = None
        self.isotonic_inverse = None

    def fit(self, y, post_pred):
        """Fit the forward and the inverse isotonic regressions.

        Args:
            y: observed response, array of shape (T,) or (T, 1)
            post_pred: posterior predictive samples, array of shape (N, T)

        Returns:
            self: the fitted QuantileCalibration instance
        """
        assert y.shape[0] == post_pred.shape[
            1], "y.shape[0] must match post_pred.shape[1]"

        # Recalibration dataset: paired predicted / empirical quantiles.
        pred_q, emp_q = make_cal_dataset(y, post_pred)

        # Forward direction: predicted -> empirical.
        self.isotonic = IsotonicRegression(out_of_bounds="clip")
        self.isotonic.fit(pred_q, emp_q)

        # Reverse direction: empirical -> predicted.
        self.isotonic_inverse = IsotonicRegression(out_of_bounds="clip")
        self.isotonic_inverse.fit(emp_q, pred_q)

        return self

    def transform(self, quantiles):
        """Map predicted quantiles to empirical quantiles.

        Args:
            quantiles: a 1-dimensional array of predicted quantiles

        Returns:
            the matching empirical quantiles, a 1-dimensional array
        """
        assert self.isotonic is not None, "The calibration instance must be fit first"
        return self.isotonic.transform(quantiles)

    def inverse_transform(self, quantiles):
        """Map desired (empirical) quantiles back to predicted quantiles.

        Args:
            quantiles: a 1-dimensional array of desired quantiles

        Returns:
            the matching predicted quantiles, a 1-dimensional array
        """
        assert self.isotonic_inverse is not None, "The calibration instance must be fit first"
        return self.isotonic_inverse.transform(quantiles)
def transform(self, X):
    """Transform ``X`` by delegating to ``IsotonicRegression.transform``.

    The data is handed over under the keyword name ``T``, so callers can use
    the usual ``X`` argument name with this method.
    """
    transformed = IsotonicRegression.transform(self, T=X)
    return transformed
class LLRIsotonicRegression(BaseEstimator, TransformerMixin):
    """Turn scores into log-likelihood ratios with an isotonic regression.

    An isotonic regression clipped to [`y_min`, `y_max`] is fitted from
    scores to 0/1 labels; `transform` returns `log(p) - log(1 - p)` of its
    output.

    Parameters
    ----------
    equal_priors : bool
        Downsample the larger class so both classes have the same size
        before fitting.
    y_min, y_max : float
        Bounds of the fitted regression; they keep the logarithms finite.
    plottable : bool
        Keep the (filtered) training data so `_repr_png_` can plot it.
    """

    def __init__(self,
                 equal_priors=False,
                 y_min=1e-4,
                 y_max=1. - 1e-4,
                 plottable=False):
        super(LLRIsotonicRegression, self).__init__()
        self.equal_priors = equal_priors
        self.y_min = y_min
        self.y_max = y_max
        self.plottable = plottable

    def fit(self, X, y):
        """Fit the isotonic regression on scores `X` and labels `y`.

        Returns
        -------
        self
        """
        # presumably drops samples whose label is neither 0 nor 1 and
        # flattens both arrays -- verify against keepZeroOrOne
        X, y = keepZeroOrOne(X, y, reshape=(-1, ))

        if self.plottable:
            self.X_ = X
            self.y_ = y

        if self.equal_priors:

            positive = X[y == 1]
            n_positive = len(positive)

            negative = X[y == 0]
            n_negative = len(negative)

            if n_positive > n_negative:
                # downsample positive examples
                positive = np.random.choice(positive,
                                            size=(n_negative, ),
                                            replace=False)
                n_positive = len(positive)
            else:
                # downsample negative examples
                negative = np.random.choice(negative,
                                            size=(n_positive, ),
                                            replace=False)
                n_negative = len(negative)

            X = np.hstack([negative, positive])
            y = np.hstack([
                np.zeros((n_negative, ), dtype=int),
                np.ones((n_positive, ), dtype=int)
            ])

        n_samples = X.shape[0]

        # hack for numpy: build the field names and dtype codes with str()
        _X_, f8 = str('X'), str('f8')
        _y_, i1 = str('y'), str('i1')
        # pack scores and labels in one structured array so that sorting by
        # score keeps every label attached to its score
        Xy = np.zeros((n_samples, ), dtype=[(_X_, f8), (_y_, i1)])
        Xy[_X_] = X
        Xy[_y_] = y
        sorted_Xy = np.sort(Xy, order=_X_)

        self.regression_ = IsotonicRegression(y_min=self.y_min,
                                              y_max=self.y_max,
                                              out_of_bounds='clip')

        self.regression_.fit(sorted_Xy[_X_], sorted_Xy[_y_])

        return self

    def transform(self, X):
        """Return the log-likelihood ratio of every score in `X`.

        The result has the same shape as `X`.
        """
        X = np.asarray(X)
        shape = X.shape
        p = self.regression_.transform(X.reshape((-1, )))
        p = p.reshape(shape)
        return np.log(p) - np.log(1. - p)

    def _repr_png_(self):
        """Render score histograms, the LLR curve and the posterior curve.

        Returns None when no training data was stored, i.e. when the model
        was not fitted with `plottable=True`.
        """
        if not hasattr(self, 'X_'):
            return None

        from pyannote.core.notebook import plt, _render

        # remember current figure size
        figsize = plt.rcParams['figure.figsize']
        # and update it for segment display
        plt.rcParams['figure.figsize'] = (5, 10)

        try:
            fig, (ax1, ax2, ax3) = plt.subplots(3, 1)

            # shared bin edges for both classes; x range is mean +/- 3 std
            _, bins = np.histogram(self.X_, bins=100)
            mu, sigma = np.mean(self.X_), np.std(self.X_)
            m = mu - 3 * sigma
            M = mu + 3 * sigma
            # m, M = np.min(self.X_), np.max(self.X_)

            positive = self.X_[self.y_ == 1]
            negative = self.X_[self.y_ == 0]

            # `density=True` replaces the `normed=True` keyword that
            # Matplotlib removed.
            ax1.hist(positive, bins=bins, alpha=0.5, color='g', density=True)
            ax1.hist(negative, bins=bins, alpha=0.5, color='r', density=True)
            ax1.set_xlim(m, M)

            t = np.linspace(m, M, 50)
            ax2.plot(t, self.transform(t))
            ax2.plot([m, M], [0, 0], 'k--')
            ax2.set_xlim(m, M)

            ax3.plot(t, posterior(self.transform(t)))
            ax3.plot([m, M], [0.5, 0.5], 'k--')
            ax3.set_xlim(m, M)
            ax3.set_ylim(-0.1, 1.1)

            data = _render(fig)
        finally:
            # go back to previous figure size, even if plotting failed
            plt.rcParams['figure.figsize'] = figsize

        return data
def calibrated(test_predictions,
               oof_predictions,
               flag_transform=sigmoid,
               type_transform=parse_classifier_probas):
    """
    Update test predictions w.r.t to calibration trained on OOF predictions.

    The flag and the type prediction columns are each calibrated with their
    own isotonic regression, fitted on the out-of-fold predictions against
    the `true_modification_flag` column. A calibration is applied to the test
    predictions only when it improves `alaska_weighted_auc` on the
    out-of-fold data; otherwise a warning is emitted and the column is left
    as transformed. For the flag column a diagnostic histogram is also shown
    in that case.

    :param test_predictions: DataFrame with the two prediction columns
    :param oof_predictions: DataFrame with the two prediction columns and
        `true_modification_flag`
    :param flag_transform: applied element-wise to the flag column
    :param type_transform: applied element-wise to the type column
    :return: tuple of (calibrated copy of test_predictions, dict with the
        AUC before / after calibration of both columns)
    """
    from sklearn.isotonic import IsotonicRegression as IR
    import matplotlib.pyplot as plt

    # Work on copies so the callers' frames are not modified.
    oof_predictions = oof_predictions.copy()
    oof_predictions[OUTPUT_PRED_MODIFICATION_TYPE] = oof_predictions[
        OUTPUT_PRED_MODIFICATION_TYPE].apply(type_transform)
    oof_predictions[OUTPUT_PRED_MODIFICATION_FLAG] = oof_predictions[
        OUTPUT_PRED_MODIFICATION_FLAG].apply(flag_transform)

    test_predictions = test_predictions.copy()
    test_predictions[OUTPUT_PRED_MODIFICATION_TYPE] = test_predictions[
        OUTPUT_PRED_MODIFICATION_TYPE].apply(type_transform)
    test_predictions[OUTPUT_PRED_MODIFICATION_FLAG] = test_predictions[
        OUTPUT_PRED_MODIFICATION_FLAG].apply(flag_transform)

    y_true = oof_predictions["true_modification_flag"].values.astype(int)
    # print("Target", np.bincount(oof_predictions["true_modification_type"].values.astype(int)))

    # --- flag column ---
    y_pred_raw = oof_predictions[OUTPUT_PRED_MODIFICATION_FLAG].values
    b_auc_before = alaska_weighted_auc(y_true, y_pred_raw)

    ir_flag = IR(out_of_bounds="clip", y_min=0, y_max=1)
    y_pred_cal = ir_flag.fit_transform(y_pred_raw, y_true)
    b_auc_after = alaska_weighted_auc(y_true, y_pred_cal)

    if b_auc_after > b_auc_before:
        test_predictions[OUTPUT_PRED_MODIFICATION_FLAG] = ir_flag.transform(
            test_predictions[OUTPUT_PRED_MODIFICATION_FLAG].values)
    else:
        warnings.warn(f"Failed to train IR flag {b_auc_before} {b_auc_after}")

        # Diagnostic plot: each histogram is labelled with the AUC of the
        # predictions it shows (raw -> before, calibrated -> after).
        plt.figure()
        plt.hist(y_pred_raw,
                 alpha=0.5,
                 bins=100,
                 label=f"non-calibrated {b_auc_before}")
        plt.hist(y_pred_cal,
                 alpha=0.5,
                 bins=100,
                 label=f"calibrated {b_auc_after}")
        plt.yscale("log")
        plt.legend()
        plt.show()

    # --- type column ---
    ir_type = IR(out_of_bounds="clip", y_min=0, y_max=1)
    y_pred_raw = oof_predictions[OUTPUT_PRED_MODIFICATION_TYPE].values
    c_auc_before = alaska_weighted_auc(y_true, y_pred_raw)
    y_pred_cal = ir_type.fit_transform(y_pred_raw, y_true)
    c_auc_after = alaska_weighted_auc(y_true, y_pred_cal)

    if c_auc_after > c_auc_before:
        test_predictions[OUTPUT_PRED_MODIFICATION_TYPE] = ir_type.transform(
            test_predictions[OUTPUT_PRED_MODIFICATION_TYPE].values)
    else:
        warnings.warn(
            f"Failed to train IR on type {c_auc_before} {c_auc_after}")

    results = {
        "b_auc_before": b_auc_before,
        "b_auc_after": b_auc_after,
        "c_auc_before": c_auc_before,
        "c_auc_after": c_auc_after,
    }
    return test_predictions, results
is_contact[is_contact < 0.5], alpha=0.1, s=SIZE, color='k') sc_1 = plt.scatter(gdca_scores[is_contact > 0.5], is_contact[is_contact > 0.5], alpha=0.1, s=SIZE, color='k') sc_0.set_rasterized(True) sc_1.set_rasterized(True) print('--', time.time() - t0) mean, edges, _ = stats.binned_statistic(gdca_scores, is_contact, bins=bins) centres = (edges[:-1] + edges[1:]) / 2 plt.plot(centres, mean, color=settings.BLUE, alpha=0.5, linestyle='--') plt.plot(x, iso.transform(x), color=settings.MAROON, linewidth=2) plt.xlabel('Contact score') plt.ylabel('Contact probability') #plt.title('Isotonic regression') plt.xlim(-1.2, 4.2) plt.xticks(range(-1, 5)) plt.tight_layout() print('.', time.time() - t0) plt.savefig('../figures/isotonic.pdf') print('!', time.time() - t0) plt.show()