def main():
    dataset = 'cifar100'
    num_samples = 1000
    datafile = DATAFILE_LIST[dataset]
    num_classes = NUM_CLASSES_DICT[dataset]
    categories, observations, confidences, idx2category, category2idx, labels = prepare_data(datafile, False)

    # accuracy model
    accuracy_model = BetaBernoulli(k=num_classes, prior=None)
    accuracy_model.update_batch(categories, observations)

    # ECE model for each class
    ece_model = ClasswiseEce(num_classes, num_bins=10, pseudocount=2)
    ece_model.update_batch(categories, observations, confidences)

    # draw samples from the posteriors of classwise accuracy and classwise ECE
    accuracy_samples = accuracy_model.sample(num_samples)  # (num_categories, num_samples)
    ece_samples = ece_model.sample(num_samples)  # (num_categories, num_samples)

    # 2.5%, 50% and 97.5% posterior quantiles per class, shape (num_categories, 3)
    accuracy = np.array([np.quantile(accuracy_samples, 0.025, axis=1),
                         np.quantile(accuracy_samples, 0.5, axis=1),
                         np.quantile(accuracy_samples, 0.975, axis=1)]).T
    ece = np.array([np.quantile(ece_samples, 0.025, axis=1),
                    np.quantile(ece_samples, 0.5, axis=1),
                    np.quantile(ece_samples, 0.975, axis=1)]).T

    fig, axes = plot_figure_1(accuracy, ece, labels=CIFAR100_CLASSES, limit=10, reverse=False)
    fig.tight_layout()
    fig.subplots_adjust(bottom=-0.2, wspace=0.35)
    fig.set_size_inches(COLUMN_WIDTH * 1.3, 2.0)
    fig.savefig(FIGURE_DIR + 'figure1.pdf', bbox_inches="tight", pad_inches=0.05)
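# BetaBernoulli is imported from the repo's models module and is not shown in this
# file. Below is a minimal sketch (not the repo's class) of the interface assumed by
# the scripts here: independent Beta posteriors over per-class accuracy, with
# update / update_batch / sample / mpe. The real class has more functionality
# (e.g. reward, eval, per-class weights); this is only an illustrative assumption.
class BetaBernoulliSketch:
    def __init__(self, k: int, prior=None):
        # prior: (k, 2) array of Beta parameters; uniform Beta(1, 1) if None
        self._params = np.ones((k, 2)) if prior is None else np.array(prior, dtype=float)

    def update(self, category: int, observation: bool) -> None:
        # increment the success (alpha) or failure (beta) count of one class
        self._params[category, 0 if observation else 1] += 1

    def update_batch(self, categories: List[int], observations: List[bool]) -> None:
        for category, observation in zip(categories, observations):
            self.update(category, observation)

    def sample(self, num_samples: int = 1) -> np.ndarray:
        # draws from the per-class accuracy posteriors; 1-D if num_samples == 1,
        # otherwise (k, num_samples), matching the two call patterns in this repo
        draws = np.random.beta(self._params[:, 0, None], self._params[:, 1, None],
                               size=(len(self._params), num_samples))
        return draws[:, 0] if num_samples == 1 else draws

    @property
    def mpe(self) -> np.ndarray:
        # posterior point estimate per class (shown here as the posterior mean)
        return self._params[:, 0] / self._params.sum(axis=1)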
def thompson_sampling(deques: List[deque],
                      model: BetaBernoulli,
                      mode: str,
                      topk: int = 1,
                      **kwargs) -> Union[int, List[int]]:
    samples = model.sample()
    if mode == 'max':
        ranked = np.argsort(samples)[::-1]
    elif mode == 'min':
        ranked = np.argsort(samples)

    if topk == 1:
        for category in ranked:
            if len(deques[category]) != 0:
                return category
    else:
        categories_list = []
        candidates = set(i for i in range(len(deques)) if len(deques[i]) > 0)
        # if fewer than topk deques are non-empty, fall back to top-1 sampling
        if len(candidates) < topk:
            return thompson_sampling(deques, model, mode, topk=1)
        else:
            for category in ranked:
                if category in candidates:
                    categories_list.append(category)
                    if len(categories_list) == topk:
                        return categories_list
def select_and_label(dataset: 'Dataset',
                     sample_method: str,
                     prior=None,
                     weighted=False,
                     topk=1) -> Tuple[np.ndarray, np.ndarray]:
    model = BetaBernoulli(dataset.num_groups, prior=prior, weight=dataset.weight_k)
    deques = dataset.enqueue()

    dataset_len = len(dataset)
    dataset_num_groups = dataset.num_groups
    del dataset

    sampled_indices = np.zeros((dataset_len,), dtype=int)  # indices of selected data points
    mpe_log = np.zeros((dataset_len // LOG_FREQ, dataset_num_groups))

    sample_fct = SAMPLE_CATEGORY[sample_method]

    idx = 0
    mpe_log[0] = model.mpe
    while idx < dataset_len:
        if sample_method == 'ts':
            reward = model.reward(reward_type=args.metric)
        else:
            # no need to compute reward for non-ts methods
            reward = None
        categories = sample_fct(deques=deques, reward=reward, weighted=weighted, topk=topk)
        if topk == 1 or sample_method != 'ts':
            categories = [categories]

        for category in categories:
            selected = deques[category].pop()  # a dictionary
            model.update(category, selected)
            sampled_indices[idx] = selected['index']
            if (idx + 1) % LOG_FREQ == 0:
                mpe_log[idx // LOG_FREQ] = model.mpe
            idx += 1

    return sampled_indices, mpe_log
def select_and_label(dataset: 'Dataset',
                     sample_method: str,
                     budget: int,
                     group0: int,
                     group1: int,
                     prior=None,
                     weighted=False) -> Dict[str, np.ndarray]:
    model = BetaBernoulli(dataset.num_groups, prior=prior, weight=dataset.weight_k)
    deques = dataset.enqueue()

    # only keep the two groups being compared
    for i in range(len(deques)):
        if i not in [group0, group1]:
            deques[i].clear()

    sampled_indices = np.zeros((budget,), dtype=int)  # indices of selected data points
    mpe_log = np.zeros((budget // LOG_FREQ, dataset.num_groups))
    rope_eval = np.zeros((budget // LOG_FREQ, 3))

    sample_fct = SAMPLE_CATEGORY[sample_method]

    idx = 0
    while idx < budget:
        if sample_method == 'ts':
            reward = model.reward(reward_type='difference', group0=group0, group1=group1)
            category = sample_fct(deques=deques, reward=reward)
        else:
            category = sample_fct(deques=deques, weighted=weighted)

        selected = deques[category].pop()  # a dictionary
        model.update(category, selected)
        sampled_indices[idx] = selected['index']

        if (idx + 1) % LOG_FREQ == 0:
            mpe_log[idx // LOG_FREQ] = model.mpe
            alpha0, beta0 = model._params[group0]
            alpha1, beta1 = model._params[group1]
            rope_eval[idx // LOG_FREQ] = rope(alpha0, alpha1, beta0, beta1)
        idx += 1

    return {
        'sampled_indices': sampled_indices,
        'mpe_log': mpe_log,
        'rope_eval': rope_eval,
    }
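# The rope() helper called above is not defined in this file. Below is a minimal
# Monte Carlo sketch of one plausible implementation, assuming it returns the
# posterior probability that the accuracy difference between the two groups lies
# below, inside, or above a region of practical equivalence (ROPE), matching the
# three columns of rope_eval. The epsilon and sample count are illustrative
# assumptions, not values taken from the original code.
def rope(alpha0: float, alpha1: float, beta0: float, beta1: float,
         epsilon: float = 0.05, num_samples: int = 10000) -> np.ndarray:
    theta0 = np.random.beta(alpha0, beta0, size=num_samples)
    theta1 = np.random.beta(alpha1, beta1, size=num_samples)
    diff = theta0 - theta1
    below = np.mean(diff < -epsilon)     # group0 practically worse than group1
    within = np.mean(np.abs(diff) <= epsilon)  # practically equivalent
    above = np.mean(diff > epsilon)      # group0 practically better than group1
    return np.array([below, within, above])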
def main() -> None:
    num_samples = 1000  # number of posterior samples drawn per class

    with mpl.rc_context(rc=DEFAULT_RC):
        fig, axes = plt.subplots(ncols=3, nrows=2, dpi=300, sharey=False)
        idx = 0
        for dataset in DATASET_NAMES:
            datafile = DATAFILE_LIST[dataset]
            num_classes = NUM_CLASSES_DICT[dataset]
            categories, observations, confidences, idx2category, category2idx, labels = prepare_data(datafile, False)

            # accuracy model
            accuracy_model = BetaBernoulli(k=num_classes, prior=None)
            accuracy_model.update_batch(categories, observations)

            # ECE model for each class
            ece_model = ClasswiseEce(num_classes, num_bins=10, pseudocount=2)
            ece_model.update_batch(categories, observations, confidences)

            # draw samples from the posteriors of classwise accuracy and classwise ECE
            accuracy_samples = accuracy_model.sample(num_samples)  # (num_categories, num_samples)
            ece_samples = ece_model.sample(num_samples)  # (num_categories, num_samples)

            plot_kwargs = {}
            axes[idx // 3, idx % 3] = plot_scatter(axes[idx // 3, idx % 3],
                                                   accuracy_samples,
                                                   ece_samples,
                                                   limit=TOPK_DICT[dataset],
                                                   plot_kwargs=plot_kwargs)
            axes[idx // 3, idx % 3].set_title(DATASET_NAMES[dataset])
            idx += 1

        axes[0, 0].set_ylabel('ECE')
        axes[1, 0].set_ylabel('ECE')

        fig.set_size_inches(TEXT_WIDTH, 4.0)
        fig.subplots_adjust(bottom=0.05, wspace=0.2)
        fig.delaxes(axes.flatten()[5])

        figname = FIGURE_DIR + 'scatter.pdf'
        fig.tight_layout()
        fig.savefig(figname, bbox_inches='tight', pad_inches=0)
def get_bayesian_ground_truth(categories: List[int],
                              observations: List[bool],
                              confidences: List[float],
                              num_classes: int,
                              metric: str,
                              mode: str,
                              topk: int = 1,
                              pseudocount: int = 1,
                              prior=None) -> np.ndarray:
    """
    Compute the ground truth for a given metric and mode using all data points.

    :param categories: List[int]
        A list of predicted classes.
    :param observations: List[bool]
        A list of boolean observations.
    :param confidences: List[float]
        A list of prediction scores.
    :param num_classes: int
        The number of classes.
    :param metric: str
        'accuracy' or 'calibration_error'.
    :param mode: str
        'min' or 'max'.
    :param topk: int
        The number of top classes to return. Default: 1.
    :param pseudocount: int
        Strength of the prior for the ClasswiseEce model. Default: 1.
    :param prior: np.ndarray
        Prior for the BetaBernoulli model. Default: None.
    :return: binary np.ndarray of shape (num_classes,) indicating whether each class is in the top k.
    """
    if metric == 'accuracy':
        model = BetaBernoulli(num_classes, prior=prior)
        model.update_batch(categories, observations)
    elif metric == 'calibration_error':
        model = ClasswiseEce(num_classes, num_bins=10, pseudocount=pseudocount)
        model.update_batch(categories, observations, confidences)

    metric_val = model.eval
    output = np.zeros((num_classes,), dtype=np.bool_)
    if mode == 'max':
        indices = metric_val.argsort()[-topk:]
    else:
        indices = metric_val.argsort()[:topk]
    output[indices] = True
    return output
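# A minimal usage sketch for get_bayesian_ground_truth, assuming the outputs of
# prepare_data as used in the scripts above. The dataset name and topk value are
# illustrative assumptions, not values taken from the original experiments.
categories, observations, confidences, idx2category, category2idx, labels = prepare_data(
    DATAFILE_LIST['cifar100'], False)
ground_truth = get_bayesian_ground_truth(categories, observations, confidences,
                                         num_classes=NUM_CLASSES_DICT['cifar100'],
                                         metric='accuracy', mode='min', topk=10)
least_accurate = np.where(ground_truth)[0]  # indices of the 10 least accurate classes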
def thompson_sampling(deques: List[deque],
                      model: BetaBernoulli,
                      mode: str,
                      topk: int = 1,
                      **kwargs) -> Union[int, List[int]]:
    """
    Draw topk samples with Thompson sampling.

    :param deques: List[deque]
        A list of deques, each containing the unlabeled samples of one predicted class.
    :param model: BetaBernoulli
        A model of classwise accuracy.
    :param mode: str
        'min' or 'max'.
    :param topk: int
        The number of extreme classes to identify. Default: 1.
    :param kwargs: unused; accepted for a uniform sampling-function interface.
    :return: Union[int, List[int]]
        A list of topk indices if topk > 1 and at least topk deques are non-empty;
        otherwise a single index.
    """
    samples = model.sample()
    if mode == 'max':
        ranked = np.argsort(samples)[::-1]
    elif mode == 'min':
        ranked = np.argsort(samples)

    if topk == 1:
        for category in ranked:
            if len(deques[category]) != 0:
                return category
    else:
        categories_list = []
        candidates = set(i for i in range(len(deques)) if len(deques[i]) > 0)
        # if fewer than topk deques are non-empty, fall back to top-1 sampling
        if len(candidates) < topk:
            return thompson_sampling(deques, model, mode, topk=1)
        else:
            for category in ranked:
                if category in candidates:
                    categories_list.append(category)
                    if len(categories_list) == topk:
                        return categories_list
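# A minimal toy usage sketch for thompson_sampling, assuming the BetaBernoulli
# interface used throughout this repo (sample / update per class). The class count
# and the deque contents are illustrative assumptions.
toy_num_classes = 3
toy_model = BetaBernoulli(toy_num_classes, prior=None)
toy_deques = [deque([True, False, True]), deque([False, False]), deque([True])]
category = thompson_sampling(toy_deques, toy_model, mode='min', topk=1)
observation = toy_deques[category].pop()
toy_model.update(category, observation)  # posterior of the selected class is updated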
def get_samples_topk(args: argparse.Namespace,
                     categories: List[int],
                     observations: List[bool],
                     confidences: List[float],
                     labels: List[int],
                     indices: List[int],
                     num_classes: int,
                     num_samples: int,
                     sample_method: str,
                     prior=None,
                     weight=None,
                     random_seed: int = 0) -> Tuple[np.ndarray, ...]:
    # prepare model, deques, thetas, choices
    random.seed(random_seed)

    if args.metric == 'accuracy':
        model = BetaBernoulli(num_classes, prior)
    elif args.metric == 'calibration_error':
        model = ClasswiseEce(num_classes, num_bins=10, weight=weight, prior=None)

    deques = [deque() for _ in range(num_classes)]
    for category, score, observation, label, index in zip(categories, confidences, observations, labels, indices):
        if args.metric == 'accuracy':
            deques[category].append(observation)
        elif args.metric == 'calibration_error':
            deques[category].append((observation, score, label, index))
    for _deque in deques:
        random.shuffle(_deque)

    sampled_categories = np.zeros((num_samples,), dtype=int)
    sampled_observations = np.zeros((num_samples,), dtype=int)
    sampled_scores = np.zeros((num_samples,), dtype=float)
    sampled_labels = np.zeros((num_samples,), dtype=int)
    sampled_indices = np.zeros((num_samples,), dtype=int)

    sample_fct = SAMPLE_CATEGORY[sample_method]
    topk = args.topk
    idx = 0

    while idx < num_samples:
        # Sampling process: if fewer than topk arms remain available, the sampling
        # function falls back to top-1 and returns a single int instead of a list.
        categories_list = sample_fct(deques=deques,
                                     random_seed=random_seed,
                                     model=model,
                                     mode=args.mode,
                                     topk=topk,
                                     max_ttts_trial=50,
                                     ttts_beta=0.5,
                                     epsilon=0.1,
                                     ucb_c=1)
        if type(categories_list) != list:
            categories_list = [categories_list]
        if topk != 1:
            topk = 1  # subsequent draws use top-1 sampling

        # update model, deques, thetas, choices
        for category in categories_list:
            if args.metric == 'accuracy':
                observation = deques[category].pop()
                model.update(category, observation)
            elif args.metric == 'calibration_error':
                observation, score, label, index = deques[category].pop()
                model.update(category, observation, score)
                sampled_scores[idx] = score
                sampled_labels[idx] = label
                sampled_indices[idx] = index
            sampled_categories[idx] = category
            sampled_observations[idx] = observation
            idx += 1

    return sampled_categories, sampled_observations, sampled_scores, sampled_labels, sampled_indices
def eval(args: argparse.Namespace,
         categories: List[int],
         observations: List[bool],
         confidences: List[float],
         labels: List[int],
         indices: List[int],
         ground_truth: np.ndarray,
         num_classes: int,
         holdout_categories: List[int] = None,  # used when training a classwise calibration model
         holdout_observations: List[bool] = None,
         holdout_confidences: List[float] = None,
         holdout_labels: List[int] = None,
         holdout_indices: List[int] = None,
         prior=None,
         weight=None) -> Tuple[np.ndarray, ...]:
    """
    :param args:
    :param categories:
    :param observations:
    :param confidences:
    :param labels:
    :param indices:
    :param ground_truth:
    :param num_classes:
    :param holdout_categories:
    :param holdout_observations:
    :param holdout_confidences:
    :param holdout_labels:
    :param holdout_indices:
    :param prior:
    :param weight:
    :return avg_num_agreement: (num_samples // LOG_FREQ + 1,) array.
        Average agreement between the selected topk and the ground-truth topk at each step.
    :return cumulative_metric: (num_samples // LOG_FREQ + 1,) array.
        Metric (accuracy or ECE) measured on the sampled observations, categories and scores.
    :return non_cumulative_metric: (num_samples // LOG_FREQ + 1,) array.
        Average metric (accuracy or ECE), evaluated with model.eval, of the selected topk arms at each step.
    """
    num_samples = len(categories)

    if args.metric == 'accuracy':
        model = BetaBernoulli(num_classes, prior)
    elif args.metric == 'calibration_error':
        model = ClasswiseEce(num_classes, num_bins=10, weight=weight, prior=None)

    avg_num_agreement = np.zeros((num_samples // LOG_FREQ + 1,))
    cumulative_metric = np.zeros((num_samples // LOG_FREQ + 1,))
    non_cumulative_metric = np.zeros((num_samples // LOG_FREQ + 1,))

    if args.metric == 'calibration_error':
        holdout_calibrated_ece = np.zeros((num_samples // CALIBRATION_FREQ + 1,))
        if args.calibration_model in ['histogram_binning', 'isotonic_regression', 'bayesian_binning_quantiles']:
            holdout_X = np.array(holdout_confidences)
            holdout_X = np.array([1 - holdout_X, holdout_X]).T
        elif args.calibration_model in ['platt_scaling', 'temperature_scaling']:
            holdout_indices_array = np.array(holdout_indices, dtype=int)
            with process_lock:
                holdout_X = logits[holdout_indices_array]

    topk_arms = np.zeros((num_classes,), dtype=np.bool_)

    for idx, (category, observation, confidence, label, index) in enumerate(
            zip(categories, observations, confidences, labels, indices)):

        if args.metric == 'accuracy':
            model.update(category, observation)
        elif args.metric == 'calibration_error':
            model.update(category, observation, confidence)

        if idx % LOG_FREQ == 0:
            # select topk arms
            topk_arms[:] = 0
            metric_val = model.eval
            if args.mode == 'min':
                topk_indices = metric_val.argsort()[:args.topk]
            elif args.mode == 'max':
                topk_indices = metric_val.argsort()[-args.topk:]
            topk_arms[topk_indices] = 1

            # evaluation
            avg_num_agreement[idx // LOG_FREQ] = topk_arms[ground_truth == 1].mean()
            # todo: each class is equally weighted by taking the mean; replace with frequency(?)
            cumulative_metric[idx // LOG_FREQ] = model.frequentist_eval.mean()
            non_cumulative_metric[idx // LOG_FREQ] = metric_val[topk_arms].mean()

        if args.metric == 'calibration_error' and idx % CALIBRATION_FREQ == 0:
            if idx == 0:
                # ECE of the holdout set before any recalibration
                holdout_calibrated_ece[idx] = eval_ece(holdout_confidences, holdout_observations, num_bins=10)
            else:
                calibration_model = CALIBRATION_MODELS[args.calibration_model]()
                if args.calibration_model in ['histogram_binning', 'isotonic_regression', 'bayesian_binning_quantiles']:
                    X = np.array(confidences[:idx])
                    X = np.array([1 - X, X]).T
                    y = np.array(observations[:idx]) * 1
                    calibration_model.fit(X, y)
                    calibrated_holdout_confidences = calibration_model.predict_proba(holdout_X)[:, 1].tolist()
                elif args.calibration_model in ['platt_scaling', 'temperature_scaling']:
                    X = logits[indices[:idx]]
                    y = np.array(labels[:idx], dtype=int)
                    calibration_model.fit(X, y)
                    pred_array = np.array(holdout_categories).astype(int).reshape(-1, 1)
                    calibrated_holdout_confidences = calibration_model.predict_proba(holdout_X)
                    calibrated_holdout_confidences = np.take_along_axis(
                        calibrated_holdout_confidences, pred_array, axis=1).squeeze().tolist()
                holdout_calibrated_ece[idx // CALIBRATION_FREQ] = eval_ece(
                    calibrated_holdout_confidences, holdout_observations, num_bins=10)
            with process_lock:
                logger.debug(holdout_calibrated_ece)

    if args.metric == 'accuracy':
        return avg_num_agreement, cumulative_metric, non_cumulative_metric
    elif args.metric == 'calibration_error':
        return avg_num_agreement, cumulative_metric, non_cumulative_metric, holdout_calibrated_ece
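# The eval_ece helper used above is not defined in this file. A minimal sketch of a
# standard expected calibration error with equal-width confidence bins, assuming
# eval_ece(confidences, observations, num_bins) returns a scalar ECE. Details such
# as the handling of empty bins are assumptions, not taken from the original code.
def eval_ece(confidences: List[float], observations: List[bool], num_bins: int = 10) -> float:
    confidences = np.asarray(confidences, dtype=float)
    observations = np.asarray(observations, dtype=float)
    # assign each prediction to an equal-width bin; confidence 1.0 goes to the last bin
    bin_idx = np.minimum((confidences * num_bins).astype(int), num_bins - 1)
    ece = 0.0
    for b in range(num_bins):
        mask = bin_idx == b
        if mask.any():
            # weight each bin's |accuracy - confidence| gap by its fraction of samples
            ece += mask.mean() * abs(observations[mask].mean() - confidences[mask].mean())
    return ece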
def evaluate(args: argparse.Namespace,
             categories: List[int],
             observations: List[bool],
             confidences: List[float],
             labels: List[int],
             indices: List[int],
             ground_truth: np.ndarray,
             num_classes: int,
             holdout_categories: List[int] = None,  # used when training a classwise calibration model
             holdout_observations: List[bool] = None,
             holdout_confidences: List[float] = None,
             holdout_labels: List[int] = None,
             holdout_indices: List[int] = None,
             prior=None,
             weight=None,
             logits=None) -> Tuple[np.ndarray, ...]:
    """
    Evaluate the topk ground truth against predictions made by the model, which is trained on
    actively or non-actively selected samples.

    :return avg_num_agreement: (num_samples // LOG_FREQ + 1,) array.
        Average agreement between the selected topk and the ground-truth topk at each step.
    :return holdout_calibrated_ece: (num_samples // CALIBRATION_FREQ + 1,) array.
        ECE evaluated on the recalibrated holdout set.
    :return mrr: (num_samples // LOG_FREQ + 1,) array.
        MRR of the ground-truth topk at each step.
    """
    num_samples = len(categories)

    if args.metric == 'accuracy':
        model = BetaBernoulli(num_classes, prior)
    elif args.metric == 'calibration_error':
        model = ClasswiseEce(num_classes, num_bins=10, pseudocount=args.pseudocount, weight=weight)

    avg_num_agreement = np.zeros((num_samples // LOG_FREQ + 1,))
    mrr = np.zeros((num_samples // LOG_FREQ + 1,))

    if args.metric == 'calibration_error':
        holdout_calibrated_ece = np.zeros((num_samples // CALIBRATION_FREQ + 1,))
        if args.calibration_model in ['histogram_binning', 'isotonic_regression', 'bayesian_binning_quantiles',
                                      'classwise_histogram_binning', 'two_group_histogram_binning']:
            holdout_X = np.array(holdout_confidences)
            holdout_X = np.array([1 - holdout_X, holdout_X]).T
        elif args.calibration_model in ['platt_scaling', 'temperature_scaling']:
            holdout_indices_array = np.array(holdout_indices, dtype=int)
            holdout_X = logits[holdout_indices_array]

    topk_arms = np.zeros((num_classes,), dtype=np.bool_)

    for idx, (category, observation, confidence, label, index) in enumerate(
            zip(categories, observations, confidences, labels, indices)):

        if args.metric == 'accuracy':
            model.update(category, observation)
        elif args.metric == 'calibration_error':
            model.update(category, observation, confidence)

        if idx % LOG_FREQ == 0:
            # select topk arms
            topk_arms[:] = 0
            metric_val = model.eval
            if args.mode == 'min':
                topk_indices = metric_val.argsort()[:args.topk]
            elif args.mode == 'max':
                topk_indices = metric_val.argsort()[-args.topk:]
            topk_arms[topk_indices] = 1

            # evaluation
            avg_num_agreement[idx // LOG_FREQ] = topk_arms[ground_truth == 1].mean()
            # MRR
            mrr[idx // LOG_FREQ] = mean_reciprocal_rank(metric_val, ground_truth, args.mode)

        ######## RECALIBRATION ########
        if args.metric == 'calibration_error' and idx % CALIBRATION_FREQ == 0:
            if idx == 0:
                # ECE of the holdout set before any recalibration
                holdout_calibrated_ece[idx] = eval_ece(holdout_confidences, holdout_observations, num_bins=10)
            else:
                if args.calibration_model in ['histogram_binning', 'isotonic_regression', 'bayesian_binning_quantiles']:
                    calibration_model = CALIBRATION_MODELS[args.calibration_model]()
                    X = np.array(confidences[:idx])
                    X = np.array([1 - X, X]).T
                    y = np.array(observations[:idx]) * 1
                    calibration_model.fit(X, y)
                    calibrated_holdout_confidences = calibration_model.predict_proba(holdout_X)[:, 1].tolist()

                elif args.calibration_model in ['platt_scaling', 'temperature_scaling']:
                    calibration_model = CALIBRATION_MODELS[args.calibration_model]()
                    X = logits[indices[:idx]]
                    y = np.array(labels[:idx], dtype=int)
                    calibration_model.fit(X, y)
                    pred_array = np.array(holdout_categories).astype(int).reshape(-1, 1)
                    calibrated_holdout_confidences = calibration_model.predict_proba(holdout_X)
                    calibrated_holdout_confidences = np.take_along_axis(
                        calibrated_holdout_confidences, pred_array, axis=1).squeeze().tolist()

                elif args.calibration_model in ['classwise_histogram_binning']:
                    # use the current MPE reliability diagram for calibration;
                    # no need to train a separate calibration model
                    calibration_mapping = model.beta_params_mpe
                    bin_idx = np.floor(np.array(holdout_confidences) * 10).astype(int)
                    bin_idx[bin_idx == 10] = 9
                    calibrated_holdout_confidences = calibration_mapping[holdout_categories, bin_idx].tolist()

                elif args.calibration_model in ['two_group_histogram_binning']:
                    calibrated_holdout_confidences = np.zeros(len(holdout_confidences))
                    calibration_model_less_calibrated = CALIBRATION_MODELS['histogram_binning']()
                    calibration_model_more_calibrated = CALIBRATION_MODELS['histogram_binning']()
                    X = np.array(confidences[:idx])
                    X = np.array([1 - X, X]).T
                    y = np.array(observations[:idx]) * 1
                    train_mask = np.array([ground_truth[val] for val in categories[:idx]])
                    holdout_mask = np.array([ground_truth[val] for val in holdout_categories])
                    calibration_model_less_calibrated.fit(X[train_mask], y[train_mask])
                    calibration_model_more_calibrated.fit(X[np.invert(train_mask)], y[np.invert(train_mask)])
                    calibrated_holdout_confidences[holdout_mask] = calibration_model_less_calibrated.predict_proba(
                        holdout_X[holdout_mask])[:, 1]
                    calibrated_holdout_confidences[np.invert(holdout_mask)] = \
                        calibration_model_more_calibrated.predict_proba(holdout_X[np.invert(holdout_mask)])[:, 1]
                    calibrated_holdout_confidences = calibrated_holdout_confidences.tolist()

                else:
                    raise ValueError("%s is not an implemented calibration method." % args.calibration_model)

                holdout_calibrated_ece[idx // CALIBRATION_FREQ] = eval_ece(
                    calibrated_holdout_confidences, holdout_observations, num_bins=10)

    if args.metric == 'accuracy':
        return avg_num_agreement, mrr
    elif args.metric == 'calibration_error':
        return avg_num_agreement, holdout_calibrated_ece, mrr
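# The mean_reciprocal_rank helper used in evaluate() is not defined in this file.
# A minimal sketch, assuming it ranks classes by the current metric estimate
# (ascending for 'min', descending for 'max') and averages the reciprocal ranks of
# the ground-truth topk classes. This is one plausible reading of the call site,
# not necessarily the original implementation.
def mean_reciprocal_rank(metric_val: np.ndarray, ground_truth: np.ndarray, mode: str) -> float:
    order = np.argsort(metric_val)
    if mode == 'max':
        order = order[::-1]
    # rank[c] is the 1-based rank of class c under the current estimate
    rank = np.empty_like(order)
    rank[order] = np.arange(1, len(order) + 1)
    return float(np.mean(1.0 / rank[ground_truth == 1]))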