import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import precision_recall_fscore_support
from netcal.presentation import ReliabilityDiagram


def compute_val():
    # NOTE: relies on `model`, `validating_loader`, `loss`, `i`, `epoch`,
    # `res_path` and an `ece_score` helper defined in the enclosing training script
    loss_function = nn.CrossEntropyLoss()
    with torch.no_grad():
        model.eval()
        y_pred = []
        output_prob_val = []
        output_logits_val = []
        y_val_hard = []
        for sent, label in validating_loader:
            y_val_hard.append(label.item())
            sent = sent.squeeze(0)
            if torch.cuda.is_available():
                sent = sent.cuda()
                label = label.cuda()
            output = model(sent)[0]
            logit, predicted = torch.max(output.data, 1)
            output_logits_val.append(output[0].cpu().tolist())
            output_prob_val.append(torch.sigmoid(output[0]).cpu().tolist())
            y_pred.append(predicted.item())

        loss_val = loss_function(torch.Tensor(output_logits_val),
                                 torch.LongTensor(y_val_hard)).item()

    model.train()

    # compute and plot ECE
    ece_val = ece_score(np.array(y_val_hard), np.array(output_prob_val))
    n_bins = 10
    title_suffix = ''
    diagram = ReliabilityDiagram(n_bins)
    diagram.plot(np.array(output_prob_val), np.array(y_val_hard),
                 title_suffix=title_suffix)
    # plt.savefig(title_suffix + '.pdf')

    # check if binary or multi-class classification
    num_classes = len(set(y_val_hard))
    average = 'binary' if num_classes == 2 else 'macro'

    pre_val, rec_val, f1_val, _ = precision_recall_fscore_support(
        y_val_hard, y_pred, average=average, beta=1)
    _, _, f01_val, _ = precision_recall_fscore_support(
        y_val_hard, y_pred, average=average, beta=0.1)
    _, _, f10_val, _ = precision_recall_fscore_support(
        y_val_hard, y_pred, average=average, beta=10)

    print(
        'Iteration: {}. Train Loss: {:1.5f}. Test Loss: {:1.5f}, F1: {:1.3f}, '
        'ECE: {:1.3f}, Precision: {:1.3f}, Recall: {:1.3f}'.format(
            i, loss.item(), loss_val, f1_val, ece_val, pre_val, rec_val))

    # print to result file (note: mode 'w' truncates the file on every call)
    with open(res_path, 'w') as f:
        c = ('epoch, iter, loss_train, loss_test, pre_test, rec_test, '
             'f01_test, f1_test, f10_test, ece_test')
        f.write(c + '\n')
        res_i = '{}, {}, {}, {}, {}, {}, {}, {}, {}, {}\n'.format(
            epoch, i, loss.item(), loss_val, pre_val, rec_val, f01_val,
            f1_val, f10_val, ece_val)
        f.write(res_i)
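# --- Hypothetical helper (not part of the original snippet) -----------------
# `compute_val` calls an `ece_score` function that is defined elsewhere in the
# original project. Below is only a minimal sketch of a standard expected
# calibration error computation, assuming `y_prob` holds per-class scores of
# shape (n, classes) and `y_true` holds integer labels of shape (n,).
def ece_score(y_true: np.ndarray, y_prob: np.ndarray, n_bins: int = 10) -> float:
    confidences = y_prob.max(axis=1)               # top-1 confidence per sample
    correct = (y_prob.argmax(axis=1) == y_true)    # top-1 correctness per sample
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        in_bin = (confidences > lo) & (confidences <= hi)
        if in_bin.any():
            # weight the |accuracy - confidence| gap by the bin's relative size
            ece += in_bin.mean() * abs(correct[in_bin].mean() - confidences[in_bin].mean())
    return float(ece)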
import numpy as np
import matplotlib.pyplot as plt
from typing import Union
from netcal.presentation import ReliabilityDiagram


def plot_results(bins: Union[tuple, list, int], data: dict, methods0d: list,
                 methods2d: list):
    """
    Plot results as reliability diagrams (either 0D or 2D).

    Parameters
    ----------
    bins : iterable or int
        Number of bins used by ACE, ECE and MCE.
    data : dict
        Dictionary of calibration data.
    methods0d : list
        List with strings containing the keys for the calibration data
        (confidence-only methods).
    methods2d : list
        List with strings containing the keys for the calibration data
        (2D methods).
    """

    for i, methods in enumerate([methods0d, methods2d]):

        # insert the 'confidence' key at the first place in the list to keep
        # track of the default miscalibration
        methods = ['confidence'] + methods

        # on confidence only, use a single value (the first one)
        bins = bins[0] if i == 0 and isinstance(bins, (tuple, list)) else bins

        # iterate over all calibration models and plot the reliability diagram
        for method in methods:
            diagram = ReliabilityDiagram(bins, detection=True,
                                         title_suffix=method)
            fig = diagram.plot(data[method], data['matched'])

        # --------------------------------------------
        # second, plot 2D reliability diagrams as heatmaps
        for method in methods:
            data_input = np.stack((data[method], data['cx'], data['cy']),
                                  axis=1)
            diagram = ReliabilityDiagram(bins, detection=True,
                                         feature_names=['cx', 'cy'],
                                         fmin=0.0, fmax=0.3,
                                         title_suffix=method)
            fig = diagram.plot(data_input, data['matched'])

    plt.show()
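# --- Hypothetical usage sketch (keys, shapes and bin counts are illustrative only) --
def _example_plot_results():
    """Minimal sketch of the `data` layout `plot_results` expects, assuming detection
    scores with matched flags and relative box centers 'cx' / 'cy'."""
    rng = np.random.default_rng(0)
    n = 1000
    data = {
        'confidence': rng.uniform(0.0, 1.0, n),  # uncalibrated detection scores
        'histogram': rng.uniform(0.0, 1.0, n),   # scores after a (hypothetical) calibrator
        'cx': rng.uniform(0.0, 1.0, n),          # relative box center x
        'cy': rng.uniform(0.0, 1.0, n),          # relative box center y
        'matched': rng.integers(0, 2, n),        # 1 if the detection matched a ground-truth box
    }
    plot_results(bins=15, data=data,
                 methods0d=['histogram'], methods2d=['histogram'])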
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model  # assumed: the snippet loads a Keras .h5 model
from netcal.metrics import ECE
from netcal.presentation import ReliabilityDiagram
from netcal.scaling import TemperatureScaling


def script():
    ''' LOAD DATA '''
    data_path = '../data/'
    train_data = pd.read_csv(data_path + 'v3.2.2_train.csv')
    test_data = pd.read_csv(data_path + 'v3.2.2_test.csv')

    y_train, y_test = train_data[['TOT_fast', 'TOT_med_fast', 'TOT_med',
                                  'TOT_med_slow', 'TOT_slow']], \
        test_data[['TOT_fast', 'TOT_med_fast', 'TOT_med',
                   'TOT_med_slow', 'TOT_slow']]
    X_train, X_test = train_data.drop(
        ['Unnamed: 0', 'TOT_fast', 'TOT_med_fast', 'TOT_med', 'TOT_med_slow',
         'TOT_slow'], axis=1), \
        test_data.drop(
            ['Unnamed: 0', 'TOT_fast', 'TOT_med_fast', 'TOT_med',
             'TOT_med_slow', 'TOT_slow'], axis=1)

    model = load_model("../network/models/v3.2.2/model.h5")

    print("# of train samples: ", len(y_train.index))
    print("# of test samples: ", len(y_test.index))

    # Using the netcal package
    n_bins = 10
    confidences = model.predict(X_test.values)

    ece = ECE(n_bins)
    uncalibrated_score = ece.measure(confidences, y_test.values.argmax(axis=1))
    print("Calibration Error before calibration: ", uncalibrated_score)

    temperature = TemperatureScaling()
    temperature.fit(confidences, y_test.values.argmax(axis=1))
    calibrated = temperature.transform(confidences)

    ece = ECE(n_bins)
    calibrated_score = ece.measure(calibrated, y_test.values.argmax(axis=1))
    print("Calibration Error after calibration: ", calibrated_score)

    diagram = ReliabilityDiagram(n_bins)
    diagram.plot(confidences, y_test.values.argmax(axis=1))  # visualize miscalibration of uncalibrated
    diagram.plot(calibrated, y_test.values.argmax(axis=1))   # visualize miscalibration of calibrated

    np.savetxt('./calibration-data/test_calibrated_v3.2.2.csv', calibrated,
               delimiter=',')
        lw=1, color='red')
ax1.set_ylabel("Fraction of positives")
ax1.set_ylim([-0.05, 1.05])
ax1.legend(loc="upper left")
ax1.set_title('Calibration plots (reliability curve)')

ax2.set_xlabel("Mean predicted value")
ax2.set_ylabel("Count")
ax2.legend(loc="upper center", ncol=2)

plt.tight_layout()

# Temperature scaling for probability calibration using the netcal package
from netcal.scaling import TemperatureScaling

temperature = TemperatureScaling()
temperature.fit(y_prob, y_all)
calibrated = temperature.transform(y_prob)

# Computing the expected calibration error
from netcal.metrics import ECE
from netcal.presentation import ReliabilityDiagram

n_bins = 10
ece = ECE(n_bins)
uncalibrated_score = ece.measure(y_new, y_test)
calibrated_score = ece.measure(calibrated, y_test)

diagram = ReliabilityDiagram(n_bins)
diagram.plot(y_new, y_test)       # visualize miscalibration of uncalibrated
diagram.plot(calibrated, y_test)  # visualize miscalibration of calibrated
from netcal.presentation import ReliabilityDiagram


def plot_reliability_diagram(y_true, y_prob, n_bins=10, title_suffix=''):
    diagram = ReliabilityDiagram(n_bins)
    diagram.plot(y_prob, y_true, title_suffix=title_suffix)
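# --- Hypothetical usage sketch (names and values are illustrative only) -----
def _example_plot_reliability_diagram():
    """Minimal sketch showing the expected array shapes, assuming a binary task."""
    import numpy as np
    rng = np.random.default_rng(42)
    y_prob = rng.uniform(0.0, 1.0, 500)                          # predicted P(y=1) per sample
    y_true = (rng.uniform(0.0, 1.0, 500) < y_prob).astype(int)   # ground-truth labels
    plot_reliability_diagram(y_true, y_prob, n_bins=10, title_suffix='demo')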
import os

import numpy as np
from sklearn.model_selection import train_test_split
from netcal.metrics import ACE, ECE, MCE
from netcal.presentation import ReliabilityDiagram


def single_example(models: list, datafile: str, bins: int,
                   diagram: str = None, validation_split: float = 0.7,
                   save_models: bool = False, domain: str = ".") -> int:
    """
    Measure miscalibration of the given methods on the specified dataset.

    Parameters
    ----------
    models : list
        List of tuples with [('<name>', <instance of CalibrationMethod>), ...].
    datafile : str
        Path to datafile which contains two NumPy arrays with keys
        'ground_truth' and 'predictions'.
    bins : int
        Number of bins used by ECE, MCE and ReliabilityDiagram.
    diagram : str, optional, default: None
        Type of diagram which should be plotted. This could be 'diagram',
        'curve', 'inference' or None.
    validation_split : float
        Split ratio between build set and validation set.
    save_models : bool
        True if instances of calibration methods should be stored.
    domain : str, optional, default: "."
        Domain/directory where to store the results.

    Returns
    -------
    int
        0 on success, -1 otherwise.
    """

    if not os.path.exists(datafile):
        print("Dataset \'%s\' does not exist" % datafile)
        return -1

    # read NumPy input files
    try:
        with open(datafile, "rb") as open_file:
            npzfile = np.load(open_file)
            ground_truth = npzfile['ground_truth'].squeeze()
            predictions = npzfile['predictions'].squeeze()
    except KeyError:
        print("Key \'ground_truth\' or \'predictions\' not found in file \'%s\'" % datafile)
        return -1

    # split data set into build set and validation set
    build_set_gt, validation_set_gt, build_set_sm, validation_set_sm = train_test_split(
        ground_truth, predictions, test_size=validation_split,
        stratify=ground_truth, random_state=None)

    # initialize error metrics
    ace = ACE(bins)
    ece = ECE(bins)
    mce = MCE(bins)

    predictions = []
    all_ace = [ace.measure(validation_set_sm, validation_set_gt)]
    all_ece = [ece.measure(validation_set_sm, validation_set_gt)]
    all_mce = [mce.measure(validation_set_sm, validation_set_gt)]

    # ------------------------------------------
    # build and save models
    for model in models:
        name, instance = model
        print("Build %s model" % name)
        instance.fit(build_set_sm, build_set_gt)

        if save_models:
            instance.save_model("%s/models/%s.pkl" % (domain, name))

    # ------------------------------------------
    # perform predictions
    for model in models:
        _, instance = model
        prediction = instance.transform(validation_set_sm)
        predictions.append(prediction)

        all_ace.append(ace.measure(prediction, validation_set_gt))
        all_ece.append(ece.measure(prediction, validation_set_gt))
        all_mce.append(mce.measure(prediction, validation_set_gt))

    # ------------------------------------------
    # output formatted ECE
    names = [len(x[0]) for x in models]
    buffer = max(names)

    fill = (buffer - len("Default")) * " "
    print("%s%s ACE: %.5f - ECE: %.5f - MCE: %.5f" %
          ("Default", fill, all_ace[0], all_ece[0], all_mce[0]))

    for i, model in enumerate(models, start=1):
        name, instance = model
        fill = (buffer - len(name)) * " "
        print("%s%s ACE: %.5f - ECE: %.5f - MCE: %.5f" %
              (name, fill, all_ace[i], all_ece[i], all_mce[i]))

    # ------------------------------------------
    if diagram == 'diagram':
        diagram = ReliabilityDiagram(bins=bins, title_suffix="default")
        diagram.plot(validation_set_sm, validation_set_gt, filename="test.png")

        for i, prediction in enumerate(predictions):
            diagram = ReliabilityDiagram(bins=bins, title_suffix=models[i][0])
            diagram.plot(prediction, validation_set_gt)

    elif diagram is None:
        pass
    else:
        print("Unknown diagram type \'%s\'" % diagram)
        return -1

    return 0
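# --- Hypothetical usage sketch (file name and chosen methods are assumptions) --
def _example_single_example():
    """Minimal sketch: calibrate softmax outputs stored in a hypothetical .npz file."""
    from netcal.binning import HistogramBinning
    from netcal.scaling import TemperatureScaling

    models = [
        ("hist_binning", HistogramBinning(20)),
        ("temperature", TemperatureScaling()),
    ]
    # 'predictions.npz' is assumed to hold 'ground_truth' (n,) and 'predictions' (n, classes)
    return single_example(models, datafile="predictions.npz", bins=10,
                          diagram="diagram", save_models=False)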
def evaluate(
    annotations,
    results,
    iou=0.75,
    iou_type="segm",
    dataset="lvis",
    n_bins=10,
    commercial_only=False,
    subset=1.0,
    seed=0.0,
    min_score=0.0,
    vis_dir=None,
    vis_per_class=False,
    max_dets=300,
    max_dets_per_class=-1,
):
    """
    Args:
        annotations (str, Path, or dict): Path to COCO/LVIS-style annotations,
            or dict containing the annotations.
        results (str, Path, or dict): Path to COCO/LVIS-style results, or dict
            containing the results.
        iou (float): IoU threshold to evaluate calibration at.
        iou_type (str): segm or bbox
        dataset (str): lvis or coco
        n_bins (int): Number of bins for calibration eval
        commercial_only (bool): Use only commercial images for COCO. Used to
            match the Küppers et al. setting.
        subset (float): If <1.0, use a random subset of this portion for eval.
        seed (float): Used to seed the rng for subset selection.
        min_score (float): If specified, ignore detections below this threshold
            for calibration evaluation. This flag does not affect the AP
            calculation. This should generally be left at 0, but can be set to
            0.3 to match the Küppers et al. setting.
        vis_dir (str, Path, or None): If specified, output reliability diagrams
            to this directory.
        vis_per_class (bool): If vis_dir is specified and vis_per_class is
            True, output a reliability diagram for each class.
        max_dets (int): Limit number of detections per image.
        max_dets_per_class (int): Limit number of detections per class.
    """
    if vis_dir is not None:
        vis_dir = Path(vis_dir)
        plotter = ReliabilityDiagram(bins=n_bins, detection=True, metric="ECE")
    else:
        plotter = None

    rng = random.Random(seed)
    eval_wrapper = EvalWrapper(
        annotations,
        results,
        dataset_type=dataset,
        ious=[iou],
        iou_type=iou_type,
        max_dets=max_dets,
        max_dets_per_class=max_dets_per_class,
    )
    eval_obj = eval_wrapper.construct_eval(use_cats=True)
    is_lvis = eval_wrapper.is_lvis()
    params = eval_obj.params
    gt = eval_obj.lvis_gt if is_lvis else eval_obj.cocoGt

    if commercial_only:
        # Licenses 1, 2, 3 are NonCommercial
        valid_licenses = {4, 5, 6, 7, 8}
        orig_img_ids = params.img_ids if is_lvis else params.imgIds
        img_ids = [
            i for i in orig_img_ids if gt.imgs[i]["license"] in valid_licenses
        ]
        logging.info(
            f"Selecting {len(img_ids)}/{len(orig_img_ids)} commercial images.")
        if is_lvis:
            params.img_ids = img_ids
        else:
            params.imgIds = img_ids

    if subset < 1.0:
        img_ids = params.img_ids if is_lvis else params.imgIds
        k = int(round(len(img_ids) * subset))
        logging.info(f"Selecting {k}/{len(img_ids)} images randomly.")
        rng.shuffle(img_ids)
        if is_lvis:
            params.img_ids = img_ids[:k]
        else:
            params.imgIds = img_ids[:k]

    eval_obj.evaluate()
    # True positive set
    true_positives, false_positives, missed_gt = load_tp_fp_fn(eval_obj)
    eval_obj.accumulate()
    eval_obj.summarize()

    # Map class id to list of (detection: dict, is_matched: bool)
    class_dets = defaultdict(list)
    for dt_id in true_positives:
        ann = eval_wrapper.results.anns[dt_id]
        class_dets[ann["category_id"]].append((ann, True))
    for dt_id in false_positives:
        ann = eval_wrapper.results.anns[dt_id]
        class_dets[ann["category_id"]].append((ann, False))

    if min_score > 0.0:
        class_dets = {
            c: [x for x in dets if x[0]["score"] > min_score]
            for c, dets in class_dets.items()
        }
        # Remove empty classes.
        class_dets = {c: v for c, v in class_dets.items() if v}

    # Map class id to tuple of (scores, is_matched)
    scores_matched = {
        c: (
            np.array([d["score"] for d, _ in dets])[:, np.newaxis],  # scores, (n, 1)
            np.array([m for _, m in dets])[:, np.newaxis],           # is_matched, (n, 1)
        )
        for c, dets in class_dets.items()
    }
    classes = sorted(scores_matched.keys())
    all_scores = np.vstack([scores_matched[c][0] for c in classes])
    all_is_matched = np.vstack([scores_matched[c][1] for c in classes])

    ece = ECE([n_bins], detection=True)
    output_metrics = {}
    output_metrics["AP"] = eval_obj.results["AP"]
    if is_lvis:
        for f in ("f", "c", "r"):
            output_metrics[f"AP{f}"] = eval_obj.results[f"AP{f}"]
    output_metrics["ece-overall"] = ece.measure(all_scores, all_is_matched)
    if plotter:
        fig = plotter.plot(all_scores, all_is_matched,
                           filename=vis_dir / "overall.pdf")
        plt.close(fig)

    # NOTE: Classes with neither predictions nor groundtruth are skipped.
    # Classes with groundtruth but no predictions are also skipped from the
    # per-class calibration error (see the `continue` below).
    per_class_eces = {}
    predicted_classes = set(scores_matched.keys())
    missed_classes = {gt.anns[g]["category_id"] for g in missed_gt}
    for cid in missed_classes | predicted_classes:
        if cid not in predicted_classes:  # Present but not predicted
            # Skip class from calibration error.
            continue
        else:
            scores, is_matched = scores_matched[cid]
            per_class_eces[cid] = ece.measure(scores, is_matched)
            if plotter and vis_per_class:
                cname = gt.cats[cid].get("synset", gt.cats[cid]["name"])
                fig = plotter.plot(scores, is_matched,
                                   filename=vis_dir / f"class-{cid}-{cname}.pdf")
                plt.close(fig)
    output_metrics["ece-per-class"] = np.mean(list(per_class_eces.values()))

    if eval_wrapper.is_lvis():
        # Map frequency to category ids (eval_obj.freq_groups maps to indices)
        for f, indices in enumerate(eval_obj.freq_groups):
            freq = eval_obj.params.img_count_lbl[f]
            cat_ids = [eval_obj.params.cat_ids[i] for i in indices]
            cat_ids = [c for c in cat_ids if c in scores_matched]
            freq_scores = np.vstack([scores_matched[c][0] for c in cat_ids])
            freq_matched = np.vstack([scores_matched[c][1] for c in cat_ids])
            output_metrics[f"ece-freq-{freq}"] = ece.measure(
                freq_scores, freq_matched)
            output_metrics[f"ece-per-class-{freq}"] = np.mean(
                [per_class_eces[c] for c in cat_ids if c in per_class_eces])
            if plotter:
                fig = plotter.plot(freq_scores, freq_matched,
                                   filename=vis_dir / f"freq-{freq}.pdf")
                plt.close(fig)
    return output_metrics
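# --- Hypothetical usage sketch (all paths are placeholders, not from the source) --
def _example_evaluate():
    """Minimal sketch of calling `evaluate` on LVIS-style files, assuming they exist."""
    metrics = evaluate(
        annotations="data/lvis_v1_val.json",       # hypothetical annotation path
        results="output/instances_results.json",   # hypothetical detections path
        iou=0.75,
        iou_type="segm",
        dataset="lvis",
        n_bins=10,
        vis_dir="output/reliability",              # write reliability diagrams here
        vis_per_class=False,
    )
    print(metrics["AP"], metrics["ece-overall"], metrics["ece-per-class"])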
def transform(frames: List[Dict], dataset: str, network: str, subset: List,
              ious: List, test_ids: List[int]):
    """
    After calibration training, evaluate the trained models by several
    miscalibration metrics. These metrics are: D-ECE, Brier, NLL. Also capture
    the area under the precision-recall curve (AUPRC). All results are stored
    at "./output/<network>".

    Parameters
    ----------
    frames : List[Dict]
        List of dictionaries holding the input data for each image frame.
    dataset : str
        String of the used dataset (see detectron2 registered datasets).
    network : str
        String describing the base neural network.
    subset : List[str]
        List with additional features used for calibration. Options are:
        - 'cx'
        - 'cy'
        - 'w'
        - 'h'
    ious : List[float]
        List with IoU scores used for evaluation.
    test_ids : List
        List of data frame ids used for calibration testing.
    """

    # get meta information and specify all relevant paths
    meta = MetadataCatalog.get(dataset)
    model_dir = os.path.join("calibration", network, "models")
    output_dir = os.path.join("output", network)
    diagram_path = os.path.join(
        output_dir, "diagrams",
        ''.join(subset) if len(subset) > 0 else "confidence")

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(diagram_path, exist_ok=True)

    # calibration methods that have also been used for calibration training
    methods = [("histogram", HistogramBinning),
               ("lr", LogisticCalibration),
               ("lr_dependent", LogisticCalibrationDependent),
               ("betacal", BetaCalibration),
               ("betacal_dependent", BetaCalibrationDependent)]

    # reverse mapping of category ids to network class ids (e.g. for the COCO dataset)
    if hasattr(meta, "thing_dataset_id_to_contiguous_id"):
        reverse_dictionary = {
            v: k for k, v in meta.thing_dataset_id_to_contiguous_id.items()
        }
    else:
        reverse_dictionary = None

    # lists and placeholders for evaluation metrics
    n_samples_total = 0
    n_samples_per_class = []
    dece_per_class = [[[] for _ in ious] for _ in range(len(methods) + 1)]
    brier_per_class = [[[] for _ in ious] for _ in range(len(methods) + 1)]
    nll_per_class = [[[] for _ in ious] for _ in range(len(methods) + 1)]
    average_precision = [[[] for _ in ious] for _ in range(len(methods) + 1)]

    # -----------------------------------------------------
    # visualization routine
    diagram0d = ReliabilityDiagram(bins=20, detection=True, sample_threshold=8)
    diagram1d = ReliabilityDiagram(bins=[5, 15], detection=True,
                                   sample_threshold=3, fmin=0, fmax=0.3)
    diagram2d = ReliabilityDiagram(bins=[6, 9, 9], detection=True,
                                   sample_threshold=2, fmin=0, fmax=0.3)

    def plot(f: np.ndarray, m: np.ndarray, title: str, formatter: str):
        # helper function for diagram output

        # plot baseline miscalibration
        figures = [
            diagram0d.plot(f[:, :1], m, tikz=False, title_suffix=title,
                           filename=formatter % "0d")
        ]

        # plot all additional features in 1D miscalibration plots
        for i, fname in enumerate(['cx', 'cy', 'w', 'h']):
            figures.append(
                diagram1d.plot(f[:, (0, i + 1)], m, tikz=False,
                               feature_names=[fname], title_suffix=title,
                               filename=formatter % ("1d_%s" % fname)))

        # finally, plot all feature combinations of size 2
        for (i, fname1), (j, fname2) in itertools.combinations(
                enumerate(['cx', 'cy', 'w', 'h']), 2):
            figures.append(
                diagram2d.plot(f[:, (0, i + 1, j + 1)], m, tikz=False,
                               feature_names=[fname1, fname2],
                               title_suffix=title,
                               filename=formatter % ("2d_%s_%s" % (fname1, fname2))))

        # free memory space
        for fig in figures:
            plt.close(fig)

    # -----------------------------------------------------
    # iterate over all classes that are present in the current dataset
    for i, classname in enumerate(meta.thing_classes):

        # get calibration features for the selected class
        category_id = reverse_dictionary[i] if reverse_dictionary is not None else i
        features, matched, img_ids = get_features(frames, category_id, subset,
                                                  ious, test_ids)
        all_features, _, _ = get_features(frames, category_id,
                                          ['cx', 'cy', 'w', 'h'], ious, test_ids)

        if features.size == 0:
            print("No samples for category %s found" % classname)
            continue

        # different binning schemes for different feature dimensions
        if features.shape[1] == 1:
            bins = 20
        elif features.shape[1] == 3:
            bins = 8
        elif features.shape[1] == 5:
            bins = 5
        else:
            raise ValueError("Unknown dimension: %d" % features.shape[1])

        # define D-ECE metric
        dece = ECE(bins=bins, detection=True, sample_threshold=8)

        n_samples_per_class.append(features.shape[0])
        n_samples_total += features.shape[0]

        # the failed flag is required to optionally blank failed or non-present
        # classes during evaluation, i.e. if a metric returns NaN
        failed = False

        # perform evaluation for each category separately
        print("Inference: category %d: %d samples" % (category_id, features.shape[0]))
        for j, (iou, m) in enumerate(zip(ious, matched)):
            score = average_precision_score(m, features[:, 0])
            if not np.isfinite(score) or np.isnan(score):
                brier_per_class[0][j].append(0.)
                nll_per_class[0][j].append(0.)
                dece_per_class[0][j].append(0.)
                average_precision[0][j].append(0.)
                failed = True

            # compute average precision, Brier, NLL and ECE
            else:
                brier_per_class[0][j].append(
                    np.mean(np.square(features[:, 0] - m)))
                nll_per_class[0][j].append(
                    -np.mean(m * np.log(features[:, 0]) +
                             (1. - m) * np.log(1. - features[:, 0])))
                dece_per_class[0][j].append(dece.measure(features, m))
                average_precision[0][j].append(score)

                diagramname = os.path.join(
                    diagram_path,
                    "default_cls-%02d_iou%.2f" % (i, iou) + "_%s.tex")
                plot(all_features, m, title="default", formatter=diagramname)

            # start calibration evaluation for each method separately
            for k, (name, method) in enumerate(methods, start=1):
                instance = method()
                try:
                    print("Load %s and transform" % name)
                    instance.load_model(
                        os.path.join(
                            model_dir,
                            "%s_%s_iou%.2f_cls-%02d.pkl" % (name, ''.join(subset), iou, i)))

                    calibrated = instance.transform(features)

                    # perform clipping
                    np.clip(calibrated, np.finfo(np.float32).eps,
                            1. - np.finfo(np.float32).eps, out=calibrated)

                    score = average_precision_score(m, calibrated)
                    if not np.isfinite(score) or np.isnan(score):
                        raise ValueError("Couldn't compute AUPRC score")

                    average_precision[k][j].append(score)
                    brier_per_class[k][j].append(
                        np.mean(np.square(calibrated - m)))
                    nll_per_class[k][j].append(
                        -np.mean(m * np.log(calibrated) +
                                 (1. - m) * np.log(1. - calibrated)))

                    input = np.concatenate(
                        (np.reshape(calibrated, (-1, 1)), features[:, 1:]), axis=1)
                    dece_per_class[k][j].append(dece.measure(input, m))

                    diagramname = os.path.join(
                        diagram_path,
                        "%s_cls-%02d_iou%.2f" % (name, i, iou) + "_%s.tex")
                    input = np.concatenate(
                        (np.reshape(calibrated, (-1, 1)), all_features[:, 1:]), axis=1)
                    plot(input, m, title=name, formatter=diagramname)

                except (FileNotFoundError, ValueError):
                    print("Could not find weight file ",
                          os.path.join(
                              model_dir,
                              "%s_%s_iou%.2f_cls-%02d.pkl" % (name, ''.join(subset), iou, i)))
                    print("Disable evaluation for class %d" % i)
                    brier_per_class[k][j].append(0.)
                    nll_per_class[k][j].append(0.)
                    dece_per_class[k][j].append(0.)
                    average_precision[k][j].append(0.)
                    failed = True

        if failed:
            n_samples_total -= n_samples_per_class[-1]
            n_samples_per_class[-1] = 0

    # convert all lists to NumPy arrays
    weights = np.array(n_samples_per_class) / n_samples_total
    brier_per_class = np.array(brier_per_class)
    nll_per_class = np.array(nll_per_class)
    dece_per_class = np.array(dece_per_class)
    average_precision = np.array(average_precision)

    # compute a class-wise (macro) average and a sample-weighted counterpart
    brier_global = np.mean(brier_per_class, axis=2)
    weighted_brier_global = np.average(brier_per_class, weights=weights, axis=2)
    nll_global = np.mean(nll_per_class, axis=2)
    weighted_nll_global = np.average(nll_per_class, weights=weights, axis=2)
    dece_global = np.mean(dece_per_class, axis=2)
    weighted_dece_global = np.average(dece_per_class, weights=weights, axis=2)
    average_precision_macro = np.mean(average_precision, axis=2)
    average_precision_weighted = np.average(average_precision, weights=weights, axis=2)

    # use the tabulate library to visualize the evaluation results
    header = []
    body = [['default']]
    body.extend([[name] for name, method in methods])

    for i, iou in enumerate(ious):
        header.extend([
            'D-ECE(w) @ IoU %.2f' % iou, 'D-ECE @ IoU %.2f' % iou,
            'Brier(w) @ IoU %.2f' % iou, 'Brier @ IoU %.2f' % iou,
            'NLL(w) @ IoU %.2f' % iou, 'NLL @ IoU %.2f' % iou,
            'AP(w) @ IoU %.2f' % iou, 'AP @ IoU %.2f' % iou
        ])

        body[0].extend([
            weighted_dece_global[0][i], dece_global[0][i],
            weighted_brier_global[0][i], brier_global[0][i],
            weighted_nll_global[0][i], nll_global[0][i],
            average_precision_weighted[0][i], average_precision_macro[0][i]
        ])

        for k, (name, method) in enumerate(methods):
            body[k + 1].extend([
                weighted_dece_global[k + 1][i], dece_global[k + 1][i],
                weighted_brier_global[k + 1][i], brier_global[k + 1][i],
                weighted_nll_global[k + 1][i], nll_global[k + 1][i],
                average_precision_weighted[k + 1][i],
                average_precision_macro[k + 1][i]
            ])

    results = [header, *body]

    # print and also write the evaluation results in CSV format
    print("\nEvaluation Results:")
    print(tabulate(results, headers="firstrow"))

    with open(os.path.join(output_dir, "results_%s.csv" % ''.join(subset)),
              "w") as open_file:
        writer = csv.writer(open_file)
        writer.writerow(["method", ] + results[0])
        writer.writerows(results[1:])
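# --- Hypothetical usage sketch (dataset/network names and the frame schema are assumptions) --
def _example_transform(frames):
    """Minimal sketch: `frames` is assumed to be the list of per-image dicts produced by
    the project's own inference/feature-extraction step (schema not shown here)."""
    transform(frames,
              dataset="coco_2017_val",           # any detectron2-registered dataset name
              network="faster_rcnn_R_50_FPN",    # hypothetical network identifier
              subset=['cx', 'cy'],               # calibrate on confidence + box center
              ious=[0.5, 0.75],
              test_ids=list(range(0, 5000, 2)))  # hypothetical held-out frame ids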