def f1_per_depth(dist_gold: List, dist_prediction: List, max_depth: int) -> List[float]:
    """
    Report per-depth F1 of predicted node depths, restricted to essays whose
    predicted structure forms a tree.

    For every essay, a ``TreeBuilder`` is constructed from the gold and from
    the predicted answer. Essays whose prediction is not a tree (according to
    ``TreeBuilder.is_tree()``) are skipped. The node depths of the remaining
    essays are flattened and compared with ``classification_report``; the
    textual report is printed to stdout.

    Args:
        dist_gold (List): gold answer per essay
        dist_prediction (List): predicted answer per essay; indexed in
            parallel with ``dist_gold``
        max_depth (int): number of depth levels to report; scores are
            returned for depths ``0 .. max_depth - 1``

    Returns:
        List[float]: F1 score per depth, where index ``d`` holds the score of
        depth ``d``. Depths that have no entry in the classification report
        get ``0.0``.

    Raises:
        IndexError: if ``dist_prediction`` has fewer items than ``dist_gold``.
    """
    gold_depths_per_essay = []
    pred_depths_per_essay = []

    # Predictions are indexed explicitly (rather than zipped) so that a
    # prediction list shorter than the gold list still fails loudly instead
    # of being silently truncated.
    for idx, gold_answer in enumerate(dist_gold):
        gold_tree = TreeBuilder(gold_answer)
        pred_tree = TreeBuilder(dist_prediction[idx])

        # Depths are only collected when the predicted structure is a tree.
        if pred_tree.is_tree():
            gold_depths_per_essay.append(gold_tree.node_depths())
            pred_depths_per_essay.append(pred_tree.node_depths())

    gold_depths = flatten_list(gold_depths_per_essay)
    pred_depths = flatten_list(pred_depths_per_essay)

    print("=== Depth prediction performance when output forms a tree ===")
    print(classification_report(y_true=gold_depths, y_pred=pred_depths, digits=3))

    # Second call: the dict form is needed to look up per-label scores; the
    # call above only yields the formatted text.
    report = classification_report(y_true=gold_depths, y_pred=pred_depths, output_dict=True)

    # A depth that never occurs has no entry in the report; score it as 0.0.
    # Explicit defaults replace the former bare ``except:``, which would also
    # have swallowed unrelated errors such as KeyboardInterrupt.
    # NOTE(review): range(max_depth) stops at max_depth - 1; confirm whether
    # depth == max_depth should be reported as well.
    # NOTE(review): assumes the report keys are str(depth) for integer depth
    # labels - verify against what node_depths() returns.
    return [
        report.get(str(depth), {}).get('f1-score', 0.0)
        for depth in range(max_depth)
    ]