def get_cls_list(html_f):
    """
    Turn an html file into a flat list of objects that is easier to work with.

    The html file is first handed to ``htmlfile2xml`` with ``/tmp`` as the
    output directory, then the XML expected at ``/tmp/<name>.xml`` is parsed
    with ``xml2list``.

    :param html_f: Path of the input html file. The last five characters of
        its base name are dropped to build the XML name (i.e. a ``.html``
        suffix is assumed).
    :return: The result of ``xml2list`` -- [(cls, bb, score)]
    """
    out_dir = '/tmp'
    htmlfile2xml(html_f, out_dir)
    stem = os.path.basename(html_f)[:-5]
    xml_path = f'{os.path.join(out_dir, stem)}.xml'
    return xml2list(xml_path)
def visualize_xml(xml_dir, img_dir, output_dir):
    """
    Draw the boxes of every XML annotation file onto its page image.

    For each ``*.xml`` in ``xml_dir`` the same-named ``.png`` is read from
    ``img_dir``, converted to an RGB array and passed to ``draw_cc`` together
    with the second field of every ``xml2list`` entry.

    :param xml_dir: Directory searched for ``*.xml`` files
    :param img_dir: Directory holding the matching ``.png`` images
    :param output_dir: Directory the annotated ``.png`` files are written to
    :return: None
    """
    pattern = os.path.join(xml_dir, '*.xml')
    for xml_path in glob.glob(pattern):
        # Drop the four-character ".xml" suffix to get the shared stem.
        image_file = os.path.basename(xml_path)[:-4] + '.png'
        page = Image.open(os.path.join(img_dir, image_file))
        page_rgb = np.array(page.convert('RGB'))
        boxes = [entry[1] for entry in xml2list(xml_path)]
        destination = os.path.join(output_dir, image_file)
        draw_cc(page_rgb, boxes, write_img_p=destination)
def convert_to_html(xml_f):
    """
    Convert one XML annotation file to html after grouping its objects.

    NOTE(review): ``xml``, ``tmp``, ``html``, ``unicodes`` and ``FILE_NAME``
    are not defined in this function; they are presumably supplied by an
    enclosing or module scope -- confirm against the caller.

    :param xml_f: XML file name, joined onto ``xml`` to locate the file. Its
        last four characters are replaced by ``.png`` to name the page image.
    :return: None
    """
    objects = xml2list(os.path.join(xml, xml_f))
    # First merge tables (allowed to absorb the listed classes), then figures.
    absorbable = ['Figure', 'Section Header', 'Page Footer', 'Page Header']
    objects = group_cls(objects, 'Table', do_table_merge=True,
                        merge_over_classes=absorbable)
    objects = group_cls(objects, 'Figure')

    image_name = f'{xml_f[:-4]}.png'
    pdf_name = FILE_NAME.search(image_name).group(1)
    image_dir = os.path.join(f'{tmp}', 'images')
    # Pass None when nothing is stored for this pdf.
    if pdf_name in unicodes:
        page_unicodes = unicodes[pdf_name]
    else:
        page_unicodes = None
    list2html(objects, image_name, image_dir, html, page_unicodes)
def load_gt(xml_dir, identifier):
    """
    Load one XML ground truth document as boxes plus class names.

    When the document holds no objects, a single all-zero box with class
    ``0`` is returned instead of an empty result.

    :param xml_dir: base path to xml
    :param identifier: xml document identifier (file name without ``.xml``)
    :return: (BBoxes built from a [K x 4] tensor in "xyhw" format, cls_names)
    """
    entries = xml2list(os.path.join(xml_dir, f"{identifier}.xml"))
    if len(entries) != 0:
        # Transpose [(cls, box), ...] into (classes, boxes).
        cls_list, tensor_list = zip(*entries)
    else:
        cls_list = [0]
        tensor_list = [[0, 0, 0, 0]]
    boxes_tensor = torch.tensor(tensor_list)
    gt_boxes = BBoxes(boxes_tensor, "xyhw")
    return gt_boxes, cls_list
def load_data(input_dir, classes):
    """
    Build training features and targets from html predictions and XML targets.

    Every ``<input_dir>/html/*.html`` file is processed with ``process_html``
    and matched (via ``match_lists``) against ``<input_dir>/target/<name>.xml``.
    Predictions for which ``get_target`` yields ``-1`` are left out.

    :param input_dir: Directory containing the ``html`` and ``target`` folders
    :param classes: Class list forwarded to ``get_target`` and
        ``get_feat_vec_train``
    :return: (features, targets) as two numpy arrays
    """
    features, targets = [], []
    html_pattern = os.path.join(input_dir, "html/*.html")
    for html_path in glob.glob(html_pattern):
        predictions = process_html(html_path)
        stem, _ext = os.path.splitext(os.path.basename(html_path))
        truth_path = os.path.join(input_dir, "target/{}.xml".format(stem))
        truths = xml2list(truth_path)
        list_map = match_lists(predictions, truths)
        for predict in predictions:
            target = get_target(predict, list_map, classes)
            if target != -1:
                targets.append(target)
                features.append(
                    get_feat_vec_train(predict, predictions, classes))
    return np.array(features), np.array(targets)
def ingest_file(path):
    """
    Ingest an XML file to a dataframe

    :param path: path to XML file
    :return: dataframe with columns [label, x0, x1, y0, y1, score]; the score
        is converted to float
    """
    columns = {"label": [], "x0": [], "x1": [], "y0": [], "y1": [], "score": []}
    for record in xml2list(path):
        # Each record is indexed as (label, (x0, y0, x1, y1), score).
        box = record[1]
        columns["label"].append(record[0])
        columns["x0"].append(box[0])
        columns["x1"].append(box[2])
        columns["y0"].append(box[1])
        columns["y1"].append(box[3])
        columns["score"].append(float(record[2]))
    return pd.DataFrame(columns)
def convert_to_html(xpath, img_dir):
    """
    Convert a single XML annotation file to html, echoing what is converted.

    The parsed object list and the derived image name are printed before
    ``list2html`` is called with the fixed output target ``'html2'``.

    :param xpath: Path to the XML file; the last four characters of its base
        name are replaced by ``.png`` to name the page image
    :param img_dir: Image directory forwarded to ``list2html``
    :return: None
    """
    objects = xml2list(xpath)
    print(objects)
    image_name = f'{os.path.basename(xpath)[:-4]}.png'
    print(image_name)
    list2html(objects, image_name, img_dir, 'html2')
def run_evaluate(predict_dir, target_dir, output_dir, img_dir=None, simi=False, thres=0):
    """
    Score predicted XML annotations against ground truth XML and print a report.

    For every file name in ``predict_dir`` the prediction file and the
    same-named file in ``target_dir`` are parsed with ``xml2list`` and paired
    by ``match_lists``. Each prediction is then tagged 'background',
    'localization', 'correct', 'similar' or 'other'. Area-based bounding box
    precision/recall/F1, per-class precision/recall/F1, a class count table
    and PR / ROC charts are printed or written.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed copy of this function -- verify it against the
    upstream file before relying on it.

    NOTE(review): none of the divisions are guarded; with no matched
    predictions (e.g. an empty ``predict_dir``) the first ratio raises
    ZeroDivisionError.

    :param predict_dir: Directory of predicted XML files
    :param target_dir: Directory of ground truth XML files (same file names)
    :param output_dir: Directory for annotated images and charts
    :param img_dir: Optional directory of page ``.png`` files; when given the
        predicted boxes are drawn and saved into ``output_dir``
    :param simi: When True, a wrong-class prediction whose class shares a set
        in ``similar_class_sets`` with the target counts as 'correct'
    :param thres: IoU threshold used for the localization check
    :return: [(prediction, category)] for every prediction not tagged 'correct'
    """
    fp_list = []
    classification_p_list = []
    total_intersection = 0
    total_prediction = 0
    total_gt = 0
    for predict_f in os.listdir(predict_dir):
        predict_path = os.path.join(predict_dir, predict_f)
        target_path = os.path.join(target_dir, predict_f)
        predict_list = xml2list(predict_path)
        target_list = xml2list(target_path)
        if img_dir is not None:
            img_p = os.path.join(img_dir, predict_f[:-4] + '.png')
            img = Image.open(img_p)
            for predict in predict_list:
                p_cls, p_bb, p_score = predict
                # Every coordinate is shifted by -5 before drawing.
                # NOTE(review): reason for the offset is not visible here.
                p_bb = [x - 5 for x in p_bb]
                d = ImageDraw.Draw(img)
                d.rectangle(p_bb, outline=color_classes[p_cls])
            # NOTE(review): assumed to run once per page, after all boxes are drawn.
            img.save(os.path.join(output_dir, f'{predict_f[:-4] + ".png"}'))
        list_map = match_lists(predict_list, target_list)
        # Ground truth box -> list of predicted boxes matched to it (this page only).
        tbb_map = {}
        for predict in predict_list:
            p_cls, p_bb, p_score = predict
            # The lookup key always carries score 0.1, whatever the real score was.
            # NOTE(review): presumably match_lists keys its result the same way -- confirm.
            p_score = 0.1
            p_bb = tuple(p_bb)
            matched_target = list_map[(p_cls, p_bb, p_score)]
            if matched_target is None:
                fp_list.append((predict, 'background'))
                continue
            t, iou = matched_target
            t_cls, t_bb, t_score = t
            t_bb = tuple(t_bb)
            # Disabled class remapping, kept from the original:
            #t_cls = ICDAR_convert[t_cls]
            if t_bb in tbb_map:
                tbb_map[t_bb].append(p_bb)
            else:
                tbb_map[t_bb] = [p_bb]
            if p_cls == t_cls:
                if iou < thres:
                    fp_list.append((predict, 'localization'))
                    continue
                fp_list.append((predict, 'correct'))
                classification_p_list.append((p_cls, t_cls))
                continue
            else:
                # Wrong class: only well-localized boxes enter the class counts.
                if iou >= thres:
                    classification_p_list.append((p_cls, t_cls))
                # NOTE(review): the similarity check is assumed to run regardless of iou.
                sim = False
                for s in similar_class_sets:
                    if p_cls in s and t_cls in s:
                        sim = True
                        break
                if sim:
                    if simi:
                        fp_list.append((predict, 'correct'))
                    else:
                        fp_list.append((predict, 'similar'))
                else:
                    fp_list.append((predict, 'other'))
        page_intersection = 0
        page_prediction = 0
        page_gt = 0
        for t_bb in tbb_map:
            for prediction in tbb_map[t_bb]:
                x_left = max(t_bb[0], prediction[0])
                y_top = max(t_bb[1], prediction[1])
                x_right = min(t_bb[2], prediction[2])
                y_bottom = min(t_bb[3], prediction[3])
                # NOTE(review): not clamped at zero, so non-overlapping boxes
                # can contribute a negative (or spuriously positive) area.
                intersection_area = (x_right - x_left) * (y_bottom - y_top)
                page_intersection += intersection_area
                page_prediction += (prediction[2] - prediction[0]) * (
                    prediction[3] - prediction[1])
            # NOTE(review): assumed to be added once per ground truth box.
            page_gt += (t_bb[2] - t_bb[0]) * (t_bb[3] - t_bb[1])
        total_intersection += page_intersection
        total_prediction += page_prediction
        total_gt += page_gt
    print('Bounding box Precision')
    bb_precision = total_intersection / total_prediction
    print(bb_precision)
    print('--------')
    print('Bounding box Recall')
    bb_recall = total_intersection / total_gt
    print(bb_recall)
    print('--------')
    print('Bounding box F1')
    bb_f1 = 2 * bb_precision * bb_recall / (bb_precision + bb_recall)
    print(bb_f1)
    print('---------')
    # class_counts[predicted class][target class] -> number of occurrences.
    class_counts = {}
    for p in classification_p_list:
        p_cls, t_cls = p
        if p_cls not in class_counts:
            class_counts[p_cls] = {}
        if t_cls in class_counts[p_cls]:
            class_counts[p_cls][t_cls] += 1
        else:
            class_counts[p_cls][t_cls] = 1
    class_precisions = {}
    all_tp = 0
    all_denom = 0
    for p_cls in class_counts:
        tp = 0
        fp = 0
        for t_cls in class_counts[p_cls]:
            if p_cls == t_cls:
                tp = class_counts[p_cls][t_cls]
            else:
                fp += class_counts[p_cls][t_cls]
        denom = tp + fp
        all_tp += tp
        all_denom += denom
        # A message string stands in for the value when there is nothing to divide.
        class_precisions[
            p_cls] = tp / denom if denom != 0 else 'No false positives or true positives found'
    print('All class precision')
    all_precision = all_tp / all_denom
    print(all_precision)
    print('-----------------')
    all_tp = 0
    all_denom = 0
    class_recalls = {}
    print('DEBUG')
    # Only classes that were predicted at least once are iterated, so a
    # target class that was never predicted gets no recall entry.
    for p_cls in class_counts:
        print(class_counts)
        tp = class_counts[p_cls][p_cls] if p_cls in class_counts[p_cls] else 0
        fn = 0
        for p2_cls in class_counts:
            if p2_cls == p_cls:
                continue
            if p_cls in class_counts[p2_cls]:
                fn += class_counts[p2_cls][p_cls]
        denom = tp + fn
        all_tp += tp
        all_denom += denom
        class_recalls[
            p_cls] = tp / denom if denom != 0 else 'No false negatives or true positives found'
    print('All class recall')
    all_recall = all_tp / all_denom
    print(all_recall)
    print('--------------')
    print('All class F1')
    all_f1 = 2 * all_precision * all_recall / (all_precision + all_recall)
    print(all_f1)
    print('--------------')
    print('Class recalls')
    print(class_recalls)
    print('------------')
    print('Class precisions')
    print(class_precisions)
    print('------------')
    class_f1 = {}
    for cl in class_recalls:
        rec = class_recalls[cl]
        prec = class_precisions[cl]
        # A string recall is the "nothing found" message; report it and skip F1.
        if type(rec) == str:
            print(f'Class: {cl}')
            print(rec)
            continue
        if rec + prec == 0:
            class_f1[cl] = 0
            continue
        class_f1[cl] = 2 * rec * prec / (rec + prec)
    print('Class F1s')
    print(class_f1)
    print('-------------')
    print('Class counts')
    print(class_counts)
    df = pd.DataFrame(class_counts)
    df = df.fillna(value=0)
    df['Total'] = df.sum(axis=1)
    print(df[sorted(df.columns)])
    print('------------')
    tp_num = 0
    fp_num = 0
    current_class = None
    roc_tp = [0]
    roc_fp = [0]
    p_r_curve = []
    # fp_list is walked in insertion order; no sorting by score happens here.
    for p in fp_list:
        predict, category = p
        is_tp = category == 'correct'
        # NOTE(review): current_class is set by the first item only and never
        # updated afterwards; that first item is skipped without being counted.
        if is_tp:
            if current_class is None:
                current_class = True
                continue
            if not current_class:
                roc_tp.append(tp_num)
                roc_fp.append(fp_num)
            tp_num += 1
        else:
            if current_class is None:
                current_class = False
                continue
            if current_class:
                roc_tp.append(tp_num)
                roc_fp.append(fp_num)
            fp_num += 1
        precision = tp_num / (tp_num + fp_num)
        p_r_curve.append((precision, tp_num))
    roc_tp.append(tp_num)
    roc_fp.append(fp_num)
    # Turn the running true-positive count into a recall-like fraction.
    p_r_curve = [(x, y / tp_num) for x, y in p_r_curve]
    max_ps = []
    # For thresholds 0.0, 0.1, ..., 1.0 take the best precision among points
    # whose second coordinate exceeds the threshold.
    for i in range(11):
        chk_num = i / 10
        m_p = 0
        for x, y in p_r_curve:
            if y <= chk_num:
                continue
            if x > m_p:
                m_p = x
        max_ps.append(m_p)
    # NOTE(review): mAP is computed but neither printed nor returned.
    mAP = sum(max_ps) / len(max_ps)
    uz = list(zip(*p_r_curve))
    make_p_r_curve(uz[0], uz[1], output_dir)
    normalized_tp = [x / tp_num for x in roc_tp]
    # The ROC chart is only produced when there is at least one false positive.
    if fp_num > 0:
        normalized_fp = [x / fp_num for x in roc_fp]
        make_roc_chart(normalized_tp, normalized_fp, output_dir)
    filtered_fp_list = [fp for fp in fp_list if fp[1] != 'correct']
    print(f'True Positives: {tp_num}')
    print(f'False Positives: {fp_num}')
    return filtered_fp_list
def convert_annotations(xml_dir, output_dir):
    """
    Rewrite every XML annotation file with simplified classes and merged
    equation / equation-label boxes.

    'Table Note', 'Figure Note' and 'Abstract' objects are written as
    'Body Text'. Each 'Equation label' is paired with the closest 'Equation'
    on the same row to its left (at most 700 units away); the pair is written
    as a single equation box spanning both. Labels without such an equation
    are dropped, and unpaired equations are written unchanged.

    :param xml_dir: Directory searched for ``*.xml`` files
    :param output_dir: Directory the converted ``<name>.xml`` files go to
    :return: None
    """
    body_text_aliases = ('Table Note', 'Figure Note', 'Abstract')
    for xml_path in glob.glob(os.path.join(xml_dir, '*.xml')):
        stem = os.path.basename(xml_path)[:-4]
        objects = xml2list(xml_path)
        writer = Writer(f'{stem}.png', 1920, 1920)

        # Split into equation labels, equations, and everything else.
        labels = [obj for obj in objects if obj[0] == 'Equation label']
        equations = [obj for obj in objects if obj[0] == 'Equation']
        others = [obj for obj in objects
                  if obj not in equations and obj not in labels]
        for obj in others:
            out_cls = 'Body Text' if obj[0] in body_text_aliases else obj[0]
            writer.addObject(out_cls, *obj[1])

        # Pair each equation label with the nearest equation to its left.
        # Coordinates are in (tl_x, tl_y, br_x, br_y) form.
        paired = {}
        print(labels)
        print('---')
        print(equations)
        print(xml_path)
        for label in labels:
            label_box = label[1]
            mid_y = label_box[1] + int((label_box[3] - label_box[1]) / 2)
            # Equations whose vertical extent contains the label's midpoint.
            in_row = [eq for eq in equations if eq[1][1] <= mid_y <= eq[1][3]]
            gaps = []
            for eq in in_row:
                gap = label_box[0] - eq[1][2]
                # Negative gap: the equation is not left of the label.
                gaps.append(gap if gap >= 0 else float('inf'))
            # Oddly formatted labels have no usable candidate; drop them.
            if len(gaps) == 0:
                continue
            nearest = min(gaps)
            if nearest == float('inf') or nearest > 700:
                continue
            assoc = in_row[gaps.index(nearest)]
            # A paired equation is no longer available to other labels.
            equations = [eq for eq in equations if eq != assoc]
            paired[(label[0], tuple(label_box))] = assoc

        for eq in equations:
            writer.addObject(eq[0], *eq[1])
        for label, eq in paired.items():
            print(eq)
            print(label)
            print('----')
            merged = [
                eq[1][0],
                min(eq[1][1], label[1][1]),
                label[1][2],
                max(eq[1][3], label[1][3]),
            ]
            assert merged[3] > merged[1]
            assert merged[2] > merged[0]
            writer.addObject(eq[0], *merged)
        writer.save(os.path.join(output_dir, f'{stem}.xml'))