def calibration_isotonic_regression(model_name, model, prob_model, X_calibration, y_calibration, X_train):
    # 1. Trains the calibration regressor on the calibration data.
    # 2. Then takes the probability output of the model on the test set and returns
    #    calibrated probabilities for the subsequent calculation of the calibrated std.
    # ref: https://arxiv.org/abs/1807.00263
    if model_name == 'Bayes_Ridge_model':
        y_hat_calibration, sem_hat_calibration = model.predict(X_calibration, return_std=True)
    elif model_name == 'RF_model':
        y_hat_calibration = model.predict(X_calibration)
        sem_hat_calibration = np.sqrt(fci.random_forest_error(model, X_train, X_calibration))
    else:
        print('Error: Not able to calculate variance!')

    prob_per_int_y_calibration, prob_y_calibration, prob_y_calibration_expected, prob = count_entries_per_interval(
        y_calibration, y_hat_calibration, sem_hat_calibration)
    prob_model_y_calibration = predict_prob(y_calibration, y_hat_calibration, sem_hat_calibration)

    # isotonic regression
    from sklearn.isotonic import IsotonicRegression as IR
    ir = IR(out_of_bounds='clip')
    ir.fit(prob_model_y_calibration, prob_y_calibration)
    prob_test_calibrated = ir.transform(prob_model)
    return prob_test_calibrated
def __call__(self, valid_preacts, valid_labels):
    ir = IR()
    valid_preacts = valid_preacts.flatten()
    min_valid_preact = np.min(valid_preacts)
    max_valid_preact = np.max(valid_preacts)
    assert len(valid_preacts) == len(valid_labels)
    # sorting to be safe... I think weird results can happen when unsorted
    sorted_valid_preacts, sorted_valid_labels = zip(
        *sorted(zip(valid_preacts, valid_labels), key=lambda x: x[0]))
    y = ir.fit_transform(sorted_valid_preacts, sorted_valid_labels)

    def calibration_func(preact):
        preact = np.minimum(preact, max_valid_preact)
        preact = np.maximum(preact, min_valid_preact)
        return ir.transform(preact.flatten())

    return calibration_func
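# A self-contained sketch (not from the original code) of the same closure pattern as
# above: fit isotonic regression on held-out scores and labels, then return a function
# that clips new scores to the fitted range before transforming. All names and the
# synthetic data below are illustrative assumptions.
import numpy as np
from sklearn.isotonic import IsotonicRegression as IR

def make_calibration_func(valid_preacts, valid_labels):
    ir = IR(out_of_bounds='clip')
    ir.fit(valid_preacts, valid_labels)
    lo, hi = np.min(valid_preacts), np.max(valid_preacts)

    def calibration_func(preact):
        # clip to the range seen during fitting, then map through the fitted regressor
        return ir.transform(np.clip(preact, lo, hi).flatten())

    return calibration_func

rng = np.random.default_rng(0)
scores = rng.normal(size=1000)
labels = (scores + rng.normal(size=1000) > 0).astype(float)
calibrate = make_calibration_func(scores, labels)
print(calibrate(rng.normal(size=5)))  # calibrated probabilities in [0, 1]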
def calibration_isotonic_regression(data_calibration, prob_model):
    # calibration function
    y_true_calibration, y_hat_calibration, sem_hat_calibration = predict_w_DNN(data_calibration)
    prob_per_int_y_calibration, prob_y_calibration, prob_y_calibration_expected, prob = count_entries_per_interval(
        y_true_calibration, y_hat_calibration, sem_hat_calibration)
    prob_model_y_calibration = predict_prob(y_true_calibration, y_hat_calibration, sem_hat_calibration)

    # isotonic regression
    from sklearn.isotonic import IsotonicRegression as IR
    ir = IR(out_of_bounds='clip')
    ir.fit(prob_model_y_calibration, prob_y_calibration)
    prob_test_calibrated = ir.transform(prob_model)
    return prob_test_calibrated
def calibrate_probabilities(prob_dict, instance_label_dict):
    labels = []
    probabilities = []
    print(len(prob_dict))
    print(len(instance_label_dict))
    for i in prob_dict:
        labels.append(instance_label_dict[i])
        probabilities.append(prob_dict[i])
    ir = IR(out_of_bounds='clip')
    ir.fit(probabilities, labels)  # fit ir to abstract level precision and classes
    p_calibrated = ir.transform(probabilities)

    # plot reliability curves after and before calibration
    fig, ax = plt.subplots()
    fraction_of_positives, mean_predicted_value = calibration_curve(labels, p_calibrated, n_bins=10)
    ax.plot(mean_predicted_value, fraction_of_positives)
    fraction_of_positives, mean_predicted_value = calibration_curve(labels, probabilities, n_bins=10)
    ax.plot(mean_predicted_value, fraction_of_positives)
    plt.savefig('calibration_curve_on_data.png')
    return ir
def isotonic_calibration(self, xtrain, ytrain):
    # fit an isotonic regressor mapping raw scores (xtrain) to labels (ytrain)
    ir = IR(out_of_bounds='clip')
    ir.fit(xtrain, ytrain)
    return ir
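# A hypothetical usage sketch for the method above: fit on held-out probabilities and
# labels, then apply the returned regressor to new scores. The wrapper class name
# (Calibrator) and the synthetic data are assumptions for illustration only.
import numpy as np
from sklearn.isotonic import IsotonicRegression as IR

class Calibrator:
    def isotonic_calibration(self, xtrain, ytrain):
        ir = IR(out_of_bounds='clip')
        ir.fit(xtrain, ytrain)
        return ir

rng = np.random.default_rng(0)
p_valid = rng.uniform(size=500)                               # raw model probabilities
y_valid = (rng.uniform(size=500) < p_valid ** 2).astype(int)  # deliberately miscalibrated targets
ir = Calibrator().isotonic_calibration(p_valid, y_valid)
p_test_calibrated = ir.transform(rng.uniform(size=10))        # calibrated test probabilities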
def __init__(self, c, device):
    self.c = c
    self.ir = IR(out_of_bounds='clip')
    self.device = device
def calculate_probability_distribution(tree, instances, index, cal_method=None):
    if cal_method is None:
        # no calibration: return the raw tree distribution
        return tree.distribution_for_instance(instances.get_instance(index))

    elif cal_method == 'Platt':
        # Platt scaling: fit a logistic regression on rescaled tree scores
        p_train = np.zeros(shape=(instances.num_instances, 1))
        y_train = np.zeros(shape=(instances.num_instances, 1))
        for i, instance in enumerate(instances):
            dist = tree.distribution_for_instance(instance)
            p_train[i] = [(dist[1] - 0.5) * 2.0]
            y_train[i] = [instance.get_value(instance.class_index)]
        dist = (tree.distribution_for_instance(instances.get_instance(index))[1] - 0.5) * 2.0
        tmp = np.zeros(shape=(1, 1))
        tmp[0] = [dist]
        if np.sum(y_train) in [len(y_train), 0]:
            # all training instances belong to one class: fall back to the raw distribution
            return tree.distribution_for_instance(instances.get_instance(index))
        else:
            warnings.filterwarnings("ignore", category=FutureWarning)
            lr = LR(solver='lbfgs')
            lr.fit(p_train, np.ravel(y_train, order='C'))
            return lr.predict_proba(tmp.reshape(1, -1))[0]

    elif cal_method == 'Isotonic':
        # isotonic regression on the positive-class scores
        p_train = np.zeros(shape=(instances.num_instances, 1))
        y_train = np.zeros(shape=(instances.num_instances, 1))
        for i, instance in enumerate(instances):
            dist = tree.distribution_for_instance(instance)
            p_train[i] = [dist[1]]
            y_train[i] = [instance.get_value(instance.class_index)]
        dist = tree.distribution_for_instance(instances.get_instance(index))[1]
        tmp = np.zeros(shape=(1, 1))
        tmp[0] = [dist]
        if np.sum(y_train) in [len(y_train), 0]:
            # all training instances belong to one class: fall back to the raw distribution
            return tree.distribution_for_instance(instances.get_instance(index))
        else:
            ir = IR(out_of_bounds='clip')
            ir.fit(np.ravel(p_train, order='C'), np.ravel(y_train, order='C'))
            p = ir.transform(np.ravel(tmp, order='C'))[0]
            return [p, 1 - p]

    elif cal_method == 'ICP':
        pass

    elif cal_method == 'Venn1':
        # Venn-ABERS calibration on the larger of the two class scores
        calibrPts = []
        for i, instance in enumerate(instances):
            dist = tree.distribution_for_instance(instance)
            score = dist[0] if dist[1] < dist[0] else dist[1]
            calibrPts.append((score, instance.get_value(instance.class_index)))
        dist = tree.distribution_for_instance(instances.get_instance(index))
        score = dist[0] if dist[1] < dist[0] else dist[1]
        tmp = [score]
        p0, p1 = VennABERS.ScoresToMultiProbs(calibrPts, tmp)
        return [p0, p1]
def calibrated(test_predictions, oof_predictions, flag_transform=sigmoid, type_transform=parse_classifier_probas):
    """
    Update test predictions w.r.t. calibration trained on OOF predictions
    :param test_predictions:
    :param oof_predictions:
    :return:
    """
    from sklearn.isotonic import IsotonicRegression as IR
    import matplotlib.pyplot as plt

    oof_predictions = oof_predictions.copy()
    oof_predictions[OUTPUT_PRED_MODIFICATION_TYPE] = oof_predictions[OUTPUT_PRED_MODIFICATION_TYPE].apply(type_transform)
    oof_predictions[OUTPUT_PRED_MODIFICATION_FLAG] = oof_predictions[OUTPUT_PRED_MODIFICATION_FLAG].apply(flag_transform)

    test_predictions = test_predictions.copy()
    test_predictions[OUTPUT_PRED_MODIFICATION_TYPE] = test_predictions[OUTPUT_PRED_MODIFICATION_TYPE].apply(type_transform)
    test_predictions[OUTPUT_PRED_MODIFICATION_FLAG] = test_predictions[OUTPUT_PRED_MODIFICATION_FLAG].apply(flag_transform)

    y_true = oof_predictions["true_modification_flag"].values.astype(int)

    # calibrate the binary modification flag
    y_pred_raw = oof_predictions[OUTPUT_PRED_MODIFICATION_FLAG].values
    b_auc_before = alaska_weighted_auc(y_true, y_pred_raw)
    ir_flag = IR(out_of_bounds="clip", y_min=0, y_max=1)
    y_pred_cal = ir_flag.fit_transform(y_pred_raw, y_true)
    b_auc_after = alaska_weighted_auc(y_true, y_pred_cal)
    if b_auc_after > b_auc_before:
        test_predictions[OUTPUT_PRED_MODIFICATION_FLAG] = ir_flag.transform(
            test_predictions[OUTPUT_PRED_MODIFICATION_FLAG].values)
    else:
        warnings.warn(f"Failed to train IR flag {b_auc_before} {b_auc_after}")

    plt.figure()
    plt.hist(y_pred_raw, alpha=0.5, bins=100, label=f"non-calibrated {b_auc_before}")
    plt.hist(y_pred_cal, alpha=0.5, bins=100, label=f"calibrated {b_auc_after}")
    plt.yscale("log")
    plt.legend()
    plt.show()

    # calibrate the modification type score
    ir_type = IR(out_of_bounds="clip", y_min=0, y_max=1)
    y_pred_raw = oof_predictions[OUTPUT_PRED_MODIFICATION_TYPE].values
    c_auc_before = alaska_weighted_auc(y_true, y_pred_raw)
    y_pred_cal = ir_type.fit_transform(y_pred_raw, y_true)
    c_auc_after = alaska_weighted_auc(y_true, y_pred_cal)
    if c_auc_after > c_auc_before:
        test_predictions[OUTPUT_PRED_MODIFICATION_TYPE] = ir_type.transform(
            test_predictions[OUTPUT_PRED_MODIFICATION_TYPE].values)
    else:
        warnings.warn(f"Failed to train IR on type {c_auc_before} {c_auc_after}")

    results = {
        "b_auc_before": b_auc_before,
        "b_auc_after": b_auc_after,
        "c_auc_before": c_auc_before,
        "c_auc_after": c_auc_after,
    }
    return test_predictions, results
trainResult = runffm('../data/calibration/ffmTrain-102662.ffm', '../data/calibration/ffm-model-102662')
p_train_all = read_csv(trainResult)['prob']
oriTrain = read_csv('../data/train.csv')
sameTrain = oriTrain[oriTrain['clickTime'] >= 190000].reset_index()
print(len(sameTrain), len(p_train_all))
part_sameTrain = sameTrain[(sameTrain['clickTime'] >= 200000) & (sameTrain['clickTime'] < 290000)]
p_train = p_train_all.loc[part_sameTrain.index]
y_train = part_sameTrain['label']

ir = IR()
ir.fit(p_train, y_train)

oriResult = read_csv(
    '../data/calibration/ffm_mergeAppUser_s17_preAction_190000_no_Dist_noNum_t150_k8_l2e-05_2017-06-05-20-58-00.csv'
)
p_test = oriResult['prob']
p_calibrated = ir.transform(p_test)  # ir.predict(p_test) is equivalent
oriResult['new_prob'] = Series(p_calibrated)
oriResult.to_csv('../data/calibration/calib_temp.csv', index=False)
oriResult['nozero_new_prob'] = oriResult.apply(
    lambda x: x['new_prob'] if x['new_prob'] > 0 else x['prob'], axis='columns')
def Iso(self):
    # stub: the regressor is created here but not yet fitted or returned
    IReg = IR(y_min=None, y_max=None, increasing=True, out_of_bounds='nan')
    pass
def calibrate_probs(probabilities, classes):
    ir = IR(out_of_bounds='clip')
    ir.fit(probabilities, classes)  # fit ir to abstract level precision and classes
    p_calibrated = ir.transform(probabilities)
    return p_calibrated
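# A short sketch (not from the original code) of how calibrate_probs could be checked,
# assuming binary classes and using sklearn's brier_score_loss as the calibration metric;
# the synthetic, deliberately miscalibrated scores below are illustrative only.
import numpy as np
from sklearn.isotonic import IsotonicRegression as IR
from sklearn.metrics import brier_score_loss

rng = np.random.default_rng(0)
raw = rng.uniform(size=1000)
classes = (rng.uniform(size=1000) < raw ** 3).astype(int)  # true positive rate is raw**3, so raw scores are miscalibrated

calibrated = calibrate_probs(raw, classes)
print("Brier before:", brier_score_loss(classes, raw))
print("Brier after: ", brier_score_loss(classes, calibrated))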
    # convert to terminology used in all python files so far (redundant from a code perspective)
    y_true = y_sample.flatten()
    y_hat = m  # predicted mean
    return y_true, y_hat, varma


def calibration_isotonic_regression(data_calibration, prob_model):
    # calibration function
    y_true_calibration, y_hat_calibration, sem_hat_calibration = predict_w_DNN(data_calibration)
    prob_per_int_y_calibration, prob_y_calibration, prob_y_calibration_expected, prob = count_entries_per_interval(
        y_true_calibration, y_hat_calibration, sem_hat_calibration)
    prob_model_y_calibration = predict_prob(y_true_calibration, y_hat_calibration, sem_hat_calibration)

    # isotonic regression
    from sklearn.isotonic import IsotonicRegression as IR
    ir = IR(out_of_bounds='clip')
    ir.fit(prob_model_y_calibration, prob_y_calibration)
    prob_test_calibrated = ir.transform(prob_model)
    return prob_test_calibrated


r2_vec = []
mape_vec = []
rmspe_vec = []
mse_vec = []
acc_zone_percentage_vec = []
beta_vec = []
rlh_vec = []
avg_calibration_vec = []
SH_vec = []
###
# train/test split (in half)
train_end = y.shape[0] // 2  # integer division so the slice index stays an int
test_start = train_end + 1
y_train = y[0:train_end]
y_test = y[test_start:]
p_train = p[0:train_end]
p_test = p[test_start:]

###
ir = IR(out_of_bounds='clip')  # out_of_bounds param needs scikit-learn >= 0.15
ir.fit(p_train, y_train)
p_calibrated = ir.transform(p_test)
p_calibrated[np.isnan(p_calibrated)] = 0

###
acc = accuracy_score(y_test, np.round(p_test))
acc_calibrated = accuracy_score(y_test, np.round(p_calibrated))

auc = AUC(y_test, p_test)
auc_calibrated = AUC(y_test, p_calibrated)

ll = log_loss(y_test, p_test)
ll_calibrated = log_loss(y_test, p_calibrated)
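# A small self-contained sketch (not part of the script above) of what the out_of_bounds
# parameter controls: 'nan' returns NaN for scores outside the fitted range, while 'clip'
# (as used above) maps them to the boundary values. Data below is illustrative only.
import numpy as np
from sklearn.isotonic import IsotonicRegression as IR

scores = np.array([0.2, 0.4, 0.6, 0.8])
labels = np.array([0, 0, 1, 1])

ir_nan = IR(out_of_bounds='nan').fit(scores, labels)
ir_clip = IR(out_of_bounds='clip').fit(scores, labels)
print(ir_nan.transform([0.1, 0.5, 0.9]))   # [nan 0.5 nan]
print(ir_clip.transform([0.1, 0.5, 0.9]))  # [0.  0.5 1. ]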