def log_pdf(doc, score, scores, predict_text, answer_text):
    """Render each page of a document's PDF annotated with guesses and targets.

    Tokens whose score relative to ``score`` exceeds 0.5 are outlined on the
    page they belong to (magenta for a relative score of exactly 1, red
    otherwise, with thicker strokes for higher scores).  Tokens labelled 1 in
    ``doc.labels`` are outlined in blue.

    Args:
        doc: Document exposing ``slug``, ``labels`` and a ``tokens`` table
            with a ``page`` column scaled to 0..1 across the document.
        score: Divisor used to normalise ``scores`` (presumably the best
            score -- TODO confirm against the caller).
        scores: Per-token scores, indexable by the token table's index.
        predict_text: Predicted answer text, used in the caption and verdict.
        answer_text: Correct answer text, used in the caption and verdict.

    Returns:
        ``(verdict, caption, page_images)`` where ``page_images`` is a list
        of ``wandb.Image`` (one per page), or ``None`` if the PDF could not
        be opened.
    """
    fname = get_pdf_path(doc.slug)
    try:
        pdf = pdfplumber.open(fname)
    except Exception:
        # If the file's not there, that's fine -- we use available PDFs to
        # define what to see
        print(f"Cannot open pdf {fname}")
        return

    print(f"Rendering output for {fname}")

    # Get the correct answers: the rows of the token(s) labelled 1.
    target_rows = [
        doc.tokens.iloc[idx] for idx, val in enumerate(doc.labels) if val == 1
    ]

    # Relative score per token; identical for every page, so compute it once.
    rel_score = scores / score
    likely = rel_score > 0.5

    page_images = []
    # Close the PDF once rendering is done; the annotated images live in
    # memory and do not depend on the open file.
    with pdf:
        num_pages = len(pdf.pages)
        for pagenum, page in enumerate(pdf.pages):
            im = page.to_image(resolution=300)

            # training data has 0..1 for page range (see create-training-data.py)
            if num_pages > 1:
                current_page = pagenum / float(num_pages - 1)
            else:
                current_page = 0.0

            # Draw guesses
            page_match = np.isclose(doc.tokens["page"], current_page)
            for token in doc.tokens[page_match & likely].itertuples():
                token_score = rel_score[token.Index]
                if token_score == 1:
                    w = 5
                    s = "magenta"
                elif token_score >= 0.75:
                    w = 3
                    s = "red"
                else:
                    w = 1
                    s = "red"
                im.draw_rect(docrow_to_bbox(token), stroke=s, stroke_width=w, fill=None)

            # Draw target tokens
            rects = [
                docrow_to_bbox(row)
                for row in target_rows
                if np.isclose(row["page"], current_page)
            ]
            im.draw_rects(rects, stroke="blue", stroke_width=3, fill=None)

            page_images.append(
                wandb.Image(im.annotated, caption="page " + str(pagenum))
            )

    # get best matching score of any token in the training data
    match = doc.tokens[SINGLE_CLASS_PREDICTION].max()
    caption = (
        f"{doc.slug} guessed:{predict_text} answer:{answer_text} match:{match:.2f}"
    )
    verdict = dollar_match(predict_text, answer_text)
    return verdict, caption, page_images
def test_docrow_to_bbox(x0, y0, x1, y1, mh):
    """Check that docrow_to_bbox keeps x0/x1/y1 and honours min_height.

    Three conversions of the same box are compared: one with
    ``min_height=None``, one with the default, and one with ``min_height=mh``.
    The first must leave ``y0`` alone; the other two must be at least 10 and
    ``mh`` tall respectively (up to floating point tolerance).
    """
    source = BoundingBox(x0=x0, x1=x1, y0=y0, y1=y1)

    unpadded = docrow_to_bbox(source, min_height=None)
    default_padded = docrow_to_bbox(source)
    custom_padded = docrow_to_bbox(source, min_height=mh)

    # These three coordinates must survive every variant of the conversion.
    preserved = (("x0", Decimal(x0)), ("x1", Decimal(x1)), ("y1", Decimal(y1)))
    for converted in (unpadded, default_padded, custom_padded):
        for attr, wanted in preserved:
            assert getattr(converted, attr) == wanted

    # Without a minimum height the top edge is untouched as well.
    assert unpadded.y0 == Decimal(y0)

    def reaches(box, minimum):
        # Floating point arithmetic, yo: accept "close enough" to the minimum.
        return box.y1 - box.y0 >= minimum or isclose(box.y1 - box.y0, minimum)

    assert reaches(default_padded, 10)
    assert reaches(custom_padded, mh)
def render_tokenized_pdf(doc):
    """Render a document's PDF pages with token boxes and adjacency lines.

    Every token in ``doc.tokens`` is outlined in blue on its page, and for
    every non-zero off-diagonal entry of ``doc.adjacency_matrix`` a green
    line is drawn between the (x0, y1) corners of the two tokens, on the
    page of the first token.

    Args:
        doc: Document exposing ``slug``, a ``tokens`` table with ``page``,
            ``x0`` and ``y1`` columns, and an ``adjacency_matrix``.

    Returns:
        A list with one annotated page image per PDF page, or ``None`` if
        the PDF could not be opened.
    """
    fname = get_pdf_path(doc.slug)
    try:
        pdf = pdfplumber.open(fname)
    except Exception:
        # If the file's not there, that's fine -- we use available PDFs to
        # define what to see
        print(f"Cannot open pdf {fname}")
        return

    # Close the PDF when done; the page images are rendered up front.
    with pdf:
        page_images = [
            {"image": page.to_image(resolution=300), "rects": [], "lines": []}
            for page in pdf.pages
        ]
        num_pages = len(page_images)

        for token in doc.tokens.itertuples():
            page_num = int(token.page)
            # Ignore tokens that point at a page the PDF does not have.
            if 0 <= page_num < num_pages:
                page_images[page_num]["rects"].append(docrow_to_bbox(token))

        for first_index, second_index in np.argwhere(doc.adjacency_matrix):
            if first_index == second_index:
                continue
            first_token = doc.tokens.iloc[first_index]
            second_token = doc.tokens.iloc[second_index]
            # The line belongs on the first token's page, not on whatever
            # page the token loop above happened to finish on.
            line_page = int(first_token.page)
            if not 0 <= line_page < num_pages:
                continue
            line = (
                (Decimal(float(first_token.x0)), Decimal(float(first_token.y1))),
                (Decimal(float(second_token.x0)), Decimal(float(second_token.y1))),
            )
            page_images[line_page]["lines"].append(line)

        for entry in page_images:
            image, rects, lines = entry["image"], entry["rects"], entry["lines"]
            image.draw_rects(rects, stroke="blue", stroke_width=3, fill=None)
            print(f"first lines = {lines[:5]}")
            image.draw_lines(lines, stroke="green", stroke_width=3)

    return [entry["image"] for entry in page_images]