# imports required by the snippets below (ACE/ECE/MCE and ReliabilityDiagram are provided by netcal)
import os
from typing import Union

import numpy as np
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve
from sklearn.model_selection import train_test_split

from netcal.metrics import ACE, ECE, MCE
from netcal.presentation import ReliabilityDiagram


def eval_cal(y_preds, y_true, bins=15):
    # calibration metrics: ECE, ACE and MCE with the given number of bins
    ece = ECE(bins)
    ace = ACE(bins)
    mce = MCE(bins)

    ece_score = ece.measure(y_preds, y_true)
    ace_score = ace.measure(y_preds, y_true)
    mce_score = mce.measure(y_preds, y_true)

    return ece_score, ace_score, mce_score
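
# Illustrative usage sketch (not part of the original source): assuming `y_preds`
# holds per-sample softmax scores of shape (N, n_classes) and `y_true` holds the
# matching integer class labels, eval_cal can be smoke-tested on synthetic data.
def _demo_eval_cal():
    rng = np.random.default_rng(0)
    scores = rng.dirichlet(np.ones(3), size=100)   # dummy softmax outputs, rows sum to 1
    labels = rng.integers(0, 3, size=100)          # dummy ground-truth class indices
    ece_score, ace_score, mce_score = eval_cal(scores, labels, bins=10)
    print("ECE: %.5f - ACE: %.5f - MCE: %.5f" % (ece_score, ace_score, mce_score))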
def measure_miscalibration(bins: Union[tuple, list, int], data: dict, methods0d: list, methods2d: list):
    """
    Measure miscalibration and write to stdout.

    Parameters
    ----------
    bins : iterable or int
        Number of bins used by ACE, ECE and MCE.
    data : dict
        Dictionary of calibration data.
    methods0d : list
        List with strings containing the keys for the calibration data (confidence only methods).
    methods2d : list
        List with strings containing the keys for the calibration data (2D methods).
    """

    # iterate over 0D and 2D methods
    for i, methods in enumerate([methods0d, methods2d]):

        # insert 'confidence' key to the first place in the list to keep track of default miscalibration
        if i == 1:
            methods = ['confidence'] + methods0d + methods2d
        else:
            methods = ['confidence'] + methods

        # on confidence only, use one single value (the first one)
        bins = bins[0] if i == 0 and isinstance(bins, (tuple, list)) else bins

        # create instances for measuring miscalibration
        ace = ACE(bins=bins, detection=True)
        ece = ECE(bins=bins, detection=True)
        mce = MCE(bins=bins, detection=True)

        # initialize empty lists
        ace_list = []
        ece_list = []
        mce_list = []

        # iterate over all methods
        for method in methods:
            data_input = data[method] if i == 0 else np.stack((data[method], data['cx'], data['cy']), axis=1)

            ace_list.append(ace.measure(data_input, data['matched']))
            ece_list.append(ece.measure(data_input, data['matched']))
            mce_list.append(mce.measure(data_input, data['matched']))

        # output formatted ECE
        names = [len(x) for x in methods]
        buffer = max(names)

        # write out all miscalibration results in a 'pretty' manner
        for j, method in enumerate(methods):
            fill = (buffer - len(method)) * " "
            print("%s%s ACE: %.5f - ECE: %.5f - MCE: %.5f" % (method, fill, ace_list[j], ece_list[j], mce_list[j]))
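
# Illustrative usage sketch (not part of the original source): the `data` dict is
# assumed to hold detection confidences under 'confidence', relative box centers
# under 'cx'/'cy', matched/unmatched flags under 'matched', and one entry per
# calibration method. The method keys used here ('histogram', 'betacal') are
# hypothetical placeholders for keys produced elsewhere.
def _demo_measure_miscalibration():
    rng = np.random.default_rng(0)
    n = 1000
    data = {
        'confidence': rng.uniform(0.3, 1.0, size=n),   # uncalibrated detector confidences
        'histogram': rng.uniform(0.3, 1.0, size=n),    # placeholder calibrated confidences
        'betacal': rng.uniform(0.3, 1.0, size=n),      # placeholder calibrated confidences
        'cx': rng.uniform(0.0, 1.0, size=n),           # relative box center x
        'cy': rng.uniform(0.0, 1.0, size=n),           # relative box center y
        'matched': rng.integers(0, 2, size=n),         # 1 if detection matched ground truth
    }
    measure_miscalibration(bins=10, data=data, methods0d=['histogram'], methods2d=['betacal'])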
def single_example(models: list, datafile: str, bins: int,
                   diagram: str = None, validation_split: float = 0.7,
                   save_models: bool = False, domain: str = ".") -> int:
    """
    Measure miscalibration of given methods on specified dataset.

    Parameters
    ----------
    models : list
        List of tuples with [('<name>', <instance of CalibrationMethod>), ...].
    datafile : str
        Path to datafile which contains two NumPy arrays with keys 'ground_truth' and 'predictions'.
    bins : int
        Number of bins used by ECE, MCE and ReliabilityDiagram.
    diagram : str, optional, default: None
        Type of diagram which should be plotted. This could be 'diagram', 'curve', 'inference' or None.
    validation_split : float
        Split ratio between build set and validation set.
    save_models : bool
        True if instances of calibration methods should be stored.
    domain : str, optional, default: "."
        Domain/directory where to store the results.

    Returns
    -------
    int
        0 on success, -1 otherwise
    """

    if not os.path.exists(datafile):
        print("Dataset \'%s\' does not exist" % datafile)
        return -1

    # read NumPy input files
    try:
        with open(datafile, "rb") as open_file:
            npzfile = np.load(open_file)
            ground_truth = npzfile['ground_truth'].squeeze()
            predictions = npzfile['predictions'].squeeze()
    except KeyError:
        print("Key \'ground_truth\' or \'predictions\' not found in file \'%s\'" % datafile)
        return -1

    # split data set into build set and validation set
    build_set_gt, validation_set_gt, build_set_sm, validation_set_sm = train_test_split(
        ground_truth, predictions, test_size=validation_split,
        stratify=ground_truth, random_state=None)

    # initialize error metrics
    ace = ACE(bins)
    ece = ECE(bins)
    mce = MCE(bins)

    predictions = []
    all_ace = [ace.measure(validation_set_sm, validation_set_gt)]
    all_ece = [ece.measure(validation_set_sm, validation_set_gt)]
    all_mce = [mce.measure(validation_set_sm, validation_set_gt)]

    # ------------------------------------------
    # build and save models
    for model in models:
        name, instance = model
        print("Build %s model" % name)
        instance.fit(build_set_sm, build_set_gt)

        if save_models:
            instance.save_model("%s/models/%s.pkl" % (domain, name))

    # ------------------------------------------
    # perform predictions
    for model in models:
        _, instance = model
        prediction = instance.transform(validation_set_sm)
        predictions.append(prediction)

        all_ace.append(ace.measure(prediction, validation_set_gt))
        all_ece.append(ece.measure(prediction, validation_set_gt))
        all_mce.append(mce.measure(prediction, validation_set_gt))

    # ------------------------------------------
    # output formatted ECE
    names = [len(x[0]) for x in models]
    buffer = max(names)

    fill = (buffer - len("Default")) * " "
    print("%s%s ACE: %.5f - ECE: %.5f - MCE: %.5f" % ("Default", fill, all_ace[0], all_ece[0], all_mce[0]))

    for i, model in enumerate(models, start=1):
        name, instance = model
        fill = (buffer - len(name)) * " "
        print("%s%s ACE: %.5f - ECE: %.5f - MCE: %.5f" % (name, fill, all_ace[i], all_ece[i], all_mce[i]))

    # ------------------------------------------
    if diagram == 'diagram':
        diagram = ReliabilityDiagram(bins=bins, title_suffix="default")
        diagram.plot(validation_set_sm, validation_set_gt, filename="test.png")

        for i, prediction in enumerate(predictions):
            diagram = ReliabilityDiagram(bins=bins, title_suffix=models[i][0])
            diagram.plot(prediction, validation_set_gt)

    elif diagram is None:
        pass
    else:
        print("Unknown diagram type \'%s\'" % diagram)
        return -1

    return 0
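
# Illustrative usage sketch (not part of the original source): 'records.npz' is a
# hypothetical file holding the 'ground_truth' and 'predictions' arrays; the
# calibration methods shown here come from netcal.scaling and netcal.binning.
def _demo_single_example():
    from netcal.scaling import TemperatureScaling
    from netcal.binning import HistogramBinning

    models = [
        ("temperature", TemperatureScaling()),
        ("histogram", HistogramBinning(bins=15)),
    ]
    return single_example(models, datafile="records.npz", bins=15,
                          diagram=None, validation_split=0.7, save_models=False)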
def cross_validation_5_2(models: list, datafile: str, bins: int,
                         save_models: bool = False, domain: str = '.') -> int:
    """
    5x2 cross validation on given methods on specified dataset.

    Parameters
    ----------
    models : list
        List of tuples with [('<name>', <instance of CalibrationMethod>), ...].
    datafile : str
        Path to datafile which contains two NumPy arrays with keys 'ground_truth' and 'predictions'.
    bins : int
        Number of bins used by ECE, MCE and ReliabilityDiagram.
    save_models : bool, optional, default: False
        True if instances of calibration methods should be stored.
    domain : str, optional, default: "."
        Domain/directory where to store the results.

    Returns
    -------
    int
        0 on success, -1 otherwise
    """

    network = datafile[datafile.rfind("/") + 1:datafile.rfind(".npz")]
    seeds = [60932, 29571058, 127519, 23519410, 74198274]

    if not os.path.exists(datafile):
        print("Dataset \'%s\' does not exist" % datafile)
        return -1

    # read NumPy input files
    try:
        with open(datafile, "rb") as open_file:
            npzfile = np.load(open_file)
            ground_truth = npzfile['ground_truth'].squeeze()
            predictions = npzfile['predictions'].squeeze()
    except KeyError:
        print("Key \'ground_truth\' or \'predictions\' not found in file \'%s\'" % datafile)
        return -1

    if len(predictions.shape) == 2:
        n_classes = predictions.shape[1]
    else:
        n_classes = 2

    # initialize error metrics
    ace = ACE(bins)
    ece = ECE(bins)
    mce = MCE(bins)

    all_accuracy = []
    all_ace = []
    all_ece = []
    all_mce = []

    it = 0
    for i, seed in enumerate(seeds):
        np.random.seed(seed)

        # split data set into build set and validation set
        build_set_gt, validation_set_gt, build_set_sm, validation_set_sm = train_test_split(
            ground_truth, predictions, random_state=seed,
            test_size=0.5, stratify=ground_truth)

        for j in range(2):
            calibrated_data = {}

            # 5x2 cross validation - flip build/val set after each iteration
            build_set_gt, validation_set_gt = validation_set_gt, build_set_gt
            build_set_sm, validation_set_sm = validation_set_sm, build_set_sm

            # lists for error metrics for current iteration (it)
            it_all_accuracy = []
            it_all_ace = []
            it_all_ece = []
            it_all_mce = []

            if n_classes > 2:
                labels = np.argmax(validation_set_sm, axis=1)
            else:
                labels = np.where(validation_set_sm > 0.5, np.ones_like(validation_set_gt), np.zeros_like(validation_set_gt))

            accuracy = np.mean(np.where(labels == validation_set_gt, np.ones_like(labels), np.zeros_like(labels)))

            it_all_accuracy.append(accuracy)
            it_all_ace.append(ace.measure(validation_set_sm, validation_set_gt))
            it_all_ece.append(ece.measure(validation_set_sm, validation_set_gt))
            it_all_mce.append(mce.measure(validation_set_sm, validation_set_gt))

            # ------------------------------------------
            # build and save models
            for model in models:
                name, instance = model
                print("Build %s model" % name)
                instance.fit(build_set_sm, build_set_gt)

                if save_models:
                    instance.save_model("%s/models/%s-%s-%d.pkl" % (domain, network, name, i))

                prediction = instance.transform(validation_set_sm)
                calibrated_data[name] = prediction

                if n_classes > 2:
                    if prediction.ndim == 3:
                        prediction = np.mean(prediction, axis=0)
                    labels = np.argmax(prediction, axis=1)
                else:
                    if prediction.ndim == 2:
                        prediction = np.mean(prediction, axis=0)
                    labels = np.where(prediction > 0.5, np.ones_like(validation_set_gt), np.zeros_like(validation_set_gt))

                accuracy = np.mean(np.where(labels == validation_set_gt, np.ones_like(labels), np.zeros_like(labels)))

                it_all_accuracy.append(accuracy)
                it_all_ace.append(ace.measure(prediction, validation_set_gt))
                it_all_ece.append(ece.measure(prediction, validation_set_gt))
                it_all_mce.append(mce.measure(prediction, validation_set_gt))

            # append lists of current iterations
            all_accuracy.append(it_all_accuracy)
            all_ace.append(it_all_ace)
            all_ece.append(it_all_ece)
            all_mce.append(it_all_mce)

            filename = "%s/results/%s_%02d.npz" % (domain, network, it)
            with open(filename, "wb") as open_file:
                np.savez_compressed(open_file,
                                    train_gt=build_set_gt, test_gt=validation_set_gt,
                                    train_scores=build_set_sm, test_scores=validation_set_sm,
                                    **calibrated_data)

            it += 1

    # convert to NumPy arrays and reduce mean afterwards
    all_accuracy = np.array(all_accuracy)
    all_ace = np.array(all_ace)
    all_ece = np.array(all_ece)
    all_mce = np.array(all_mce)

    all_accuracy = np.mean(all_accuracy, axis=0)
    all_ace = np.mean(all_ace, axis=0)
    all_ece = np.mean(all_ece, axis=0)
    all_mce = np.mean(all_mce, axis=0)

    names = [len(x[0]) for x in models]
    buffer = max(names)

    # ---------------------------------------------------------
    # output formatted ECE
    fill = (buffer - len("Default")) * " "
    print("%s%s Accuracy: %.5f - ACE: %.5f - ECE: %.5f - MCE: %.5f" %
          ("Default", fill, all_accuracy[0], all_ace[0], all_ece[0], all_mce[0]))

    # ---------------------------------------------------------
    for i, model in enumerate(models, start=1):
        name, instance = model
        fill = (buffer - len(name)) * " "
        print("%s%s Accuracy: %.5f - ACE: %.5f - ECE: %.5f - MCE: %.5f" %
              (name, fill, all_accuracy[i], all_ace[i], all_ece[i], all_mce[i]))

    return 0
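
# Illustrative follow-up sketch (not part of the original source): each per-fold
# file written by the function above stores the raw splits plus one array per
# calibration method, so results can be re-read later for further analysis.
# The path below is a hypothetical example of the "%s/results/%s_%02d.npz" pattern.
def _load_fold_results(path="./results/resnet_00.npz"):
    with open(path, "rb") as open_file:
        npzfile = np.load(open_file)
        # 'train_*'/'test_*' hold the build/validation splits; the remaining keys
        # are the calibrated confidences keyed by method name
        return {key: npzfile[key] for key in npzfile.files}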
# NOTE: second variant of cross_validation_5_2 (no fixed seeds, no per-fold result files)
def cross_validation_5_2(models: list, datafile: str, bins: int, save_models: bool = False) -> int:
    """
    5x2 cross validation on given methods on specified dataset.

    Parameters
    ----------
    models : list
        List of tuples with [('<name>', <instance of CalibrationMethod>), ...].
    datafile : str
        Path to datafile which contains two NumPy arrays with keys 'ground_truth' and 'predictions'.
    bins : int
        Number of bins used by ECE, MCE and ReliabilityDiagram.
    save_models : bool, optional, default: False
        True if instances of calibration methods should be stored.

    Returns
    -------
    int
        0 on success, -1 otherwise
    """

    if not os.path.exists(datafile):
        print("Dataset \'%s\' does not exist" % datafile)
        return -1

    # read NumPy input files
    try:
        with open(datafile, "rb") as open_file:
            npzfile = np.load(open_file)
            ground_truth = npzfile['ground_truth'].squeeze()
            predictions = npzfile['predictions'].squeeze()
    except KeyError:
        print("Key \'ground_truth\' or \'predictions\' not found in file \'%s\'" % datafile)
        return -1

    if len(predictions.shape) == 2:
        n_classes = predictions.shape[1]
    else:
        n_classes = 2

    # initialize error metrics
    ace = ACE(bins)
    ece = ECE(bins)
    mce = MCE(bins)

    all_accuracy = []
    all_ace = []
    all_ece = []
    all_mce = []

    for i in range(5):

        # split data set into build set and validation set
        build_set_gt, validation_set_gt, build_set_sm, validation_set_sm = train_test_split(
            ground_truth, predictions, test_size=0.5, stratify=ground_truth)

        for _ in range(2):

            # 5x2 cross validation - flip build/val set after each iteration
            build_set_gt, validation_set_gt = validation_set_gt, build_set_gt
            build_set_sm, validation_set_sm = validation_set_sm, build_set_sm

            # lists for error metrics for current iteration (it)
            it_all_accuracy = []
            it_all_ace = []
            it_all_ece = []
            it_all_mce = []

            if n_classes > 2:
                labels = np.argmax(validation_set_sm, axis=1)
            else:
                labels = np.where(validation_set_sm > 0.5, np.ones_like(validation_set_gt), np.zeros_like(validation_set_gt))

            accuracy = np.mean(np.where(labels == validation_set_gt, np.ones_like(labels), np.zeros_like(labels)))

            it_all_accuracy.append(accuracy)
            it_all_ace.append(ace.measure(validation_set_sm, validation_set_gt))
            it_all_ece.append(ece.measure(validation_set_sm, validation_set_gt))
            it_all_mce.append(mce.measure(validation_set_sm, validation_set_gt))

            # ------------------------------------------
            # build and save models
            for model in models:
                name, instance = model
                print("Build %s model" % name)
                instance.fit(build_set_sm, build_set_gt)

                if save_models:
                    instance.save_model("./models/%s_run_%d.pkl" % (name, i))

            # ------------------------------------------
            # perform predictions
            for model in models:
                _, instance = model
                prediction = instance.transform(validation_set_sm)

                if n_classes > 2:
                    labels = np.argmax(prediction, axis=1)
                else:
                    labels = np.where(prediction > 0.5, np.ones_like(validation_set_gt), np.zeros_like(validation_set_gt))

                accuracy = np.mean(np.where(labels == validation_set_gt, np.ones_like(labels), np.zeros_like(labels)))

                it_all_accuracy.append(accuracy)
                it_all_ace.append(ace.measure(prediction, validation_set_gt))
                it_all_ece.append(ece.measure(prediction, validation_set_gt))
                it_all_mce.append(mce.measure(prediction, validation_set_gt))

            # append lists of current iterations
            all_accuracy.append(it_all_accuracy)
            all_ace.append(it_all_ace)
            all_ece.append(it_all_ece)
            all_mce.append(it_all_mce)

    # convert to NumPy arrays and reduce mean afterwards
    all_accuracy = np.array(all_accuracy)
    all_ace = np.array(all_ace)
    all_ece = np.array(all_ece)
    all_mce = np.array(all_mce)

    all_accuracy = np.mean(all_accuracy, axis=0)
    all_ace = np.mean(all_ace, axis=0)
    all_ece = np.mean(all_ece, axis=0)
    all_mce = np.mean(all_mce, axis=0)

    names = [len(x[0]) for x in models]
    buffer = max(names)

    # ---------------------------------------------------------
    # output formatted ECE
    fill = (buffer - len("Default")) * " "
    print("%s%s Accuracy: %.5f - ACE: %.5f - ECE: %.5f - MCE: %.5f" %
          ("Default", fill, all_accuracy[0], all_ace[0], all_ece[0], all_mce[0]))

    # ---------------------------------------------------------
    for i, model in enumerate(models, start=1):
        name, instance = model
        fill = (buffer - len(name)) * " "
        print("%s%s Accuracy: %.5f - ACE: %.5f - ECE: %.5f - MCE: %.5f" %
              (name, fill, all_accuracy[i], all_ace[i], all_ece[i], all_mce[i]))

    return 0
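
# Illustrative usage sketch (not part of the original source): the datafile path is
# hypothetical, and a './models' directory is assumed to exist when save_models=True.
def _demo_cross_validation_5_2():
    from netcal.scaling import LogisticCalibration, TemperatureScaling

    models = [
        ("lr", LogisticCalibration()),
        ("temperature", TemperatureScaling()),
    ]
    return cross_validation_5_2(models, datafile="records/resnet.npz", bins=15, save_models=False)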
def get_model_diagnosis(df, strategy='quantile', rps_col_prefix='model', add_baseline=False):
    """
    Diagnosis plots:
        Accepts a DataFrame containing the columns 'ordinal_result_1',
        'ordinal_result_2', 'ordinal_result_3' and 1, 2, 3.
        The columns are paired as follows: 'ordinal_result_1' is a binary column
        indicating whether a home win occurred, and the column named 1 contains the
        corresponding model probabilities. The same pairing holds for
        'ordinal_result_2' / 2 (draw) and 'ordinal_result_3' / 3 (away win).

    strategy : {'uniform', 'quantile'}, default: 'quantile'
        Strategy used to define the widths of the calibration bins.
        uniform: all bins have identical widths.
        quantile: all bins have the same number of points.

    RPS plots:
        Accepts a DataFrame containing the columns 'ordinal_result' and
        '<rps_col_prefix>_rps', plus optional baseline columns
        'rps_baseline_1', 'rps_baseline_2' and 'rps_baseline_3'.
    """
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))

    # left column: reliability curves per outcome
    ax1, ax2, ax3 = axes[:, 0]
    n_bins = 10
    mapper = {1: 'Home Win', 2: 'Draw', 3: 'Away Win'}
    for col, ax in zip([1, 2, 3], (ax1, ax2, ax3)):
        fop, mpv = calibration_curve(df['ordinal_result_' + str(col)], df[col],
                                     n_bins=n_bins, strategy=strategy)
        # plot perfectly calibrated reference
        ax.plot([0, 1], [0, 1], linestyle='--')
        # plot model reliability
        ax.plot(mpv, fop, marker='.')
        ax.set_title(mapper[col])

    # right column: RPS histograms per outcome
    ax4, ax5, ax6 = axes[:, 1]
    n_bins = 10
    mapper = {1: 'Home Win RPS', 2: 'Draw RPS', 3: 'Away Win RPS'}
    for col, ax in zip([1, 2, 3], (ax4, ax5, ax6)):
        rpss = df[df['ordinal_result'] == col][rps_col_prefix + '_rps']
        ax.hist(rpss, bins=n_bins)
        ax.set_xlim(0, 1.0)

        baseline_col_name = 'rps_baseline_' + str(col)
        if add_baseline and baseline_col_name in df.columns:
            # assumes a single baseline RPS value per outcome column
            ax.axvline(df[baseline_col_name].unique()[0], color='r')

        median = rpss.median()
        ax.axvline(median, color='r', linestyle='dashed', label=f'Median: {median:.3f}')
        ax.set_title(mapper[col])
        ax.legend()
        ax.grid()

    # overall calibration metrics on the three-class probabilities
    pred_arr, act_arr = df[[1, 2, 3]].values, df['ordinal_result'].values
    ace = ACE(bins=n_bins)
    ace_val = ace.measure(pred_arr, act_arr)
    ece = ECE(bins=n_bins)
    ece_val = ece.measure(pred_arr, act_arr)
    mce = MCE(bins=n_bins)
    mce_val = mce.measure(pred_arr, act_arr)

    print(f'Average Calibration Error: {ace_val:.3f}\n'
          f'Expected Calibration Error: {ece_val:.3f}\n'
          f'Maximum Calibration Error: {mce_val:.3f}')
    print(f"Number of Instances: {len(df)}")

    return fig, (ax1, ax2, ax3, ax4, ax5, ax6)
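
# Illustrative usage sketch (not part of the original source): builds a tiny synthetic
# DataFrame with the column layout described in the docstring above; the RPS values are
# random placeholders rather than real ranked probability scores.
def _demo_get_model_diagnosis():
    import pandas as pd

    rng = np.random.default_rng(0)
    n = 200
    probs = rng.dirichlet(np.ones(3), size=n)      # dummy probabilities for columns 1, 2, 3
    outcome = rng.integers(1, 4, size=n)           # 1 = home win, 2 = draw, 3 = away win
    df = pd.DataFrame({
        1: probs[:, 0], 2: probs[:, 1], 3: probs[:, 2],
        'ordinal_result': outcome,
        'ordinal_result_1': (outcome == 1).astype(int),
        'ordinal_result_2': (outcome == 2).astype(int),
        'ordinal_result_3': (outcome == 3).astype(int),
        'model_rps': rng.uniform(0.0, 1.0, size=n),  # matches rps_col_prefix='model'
    })
    return get_model_diagnosis(df, strategy='quantile', rps_col_prefix='model')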