Пример #1
0
def log_pdf(doc, score, scores, predict_text, answer_text):
    """Render every page of a document's PDF with guesses and targets drawn on.

    Tokens whose score relative to ``score`` exceeds 0.5 are outlined:
    magenta (width 5) when the ratio equals 1, red (width 3) when it is at
    least 0.75, and red (width 1) otherwise. Tokens labelled 1 in
    ``doc.labels`` are outlined in blue (width 3).

    Args:
        doc: Document exposing ``slug``, ``labels`` and a ``tokens`` table
            that has a ``page`` column and a ``SINGLE_CLASS_PREDICTION``
            column.
        score: Divisor applied to every entry of ``scores``.
            NOTE(review): presumably the best score, so the top token gets
            a ratio of exactly 1 -- confirm against the caller.
        scores: Per-token scores; must support elementwise division and
            comparison, and lookup by the token index.
        predict_text: Predicted answer, used in the caption and the verdict.
        answer_text: Expected answer, used in the caption and the verdict.

    Returns:
        ``None`` if the PDF cannot be opened. Otherwise a tuple
        ``(verdict, caption, page_images)`` where ``verdict`` is the result
        of ``dollar_match(predict_text, answer_text)``, ``caption`` is a
        summary string and ``page_images`` holds one ``wandb.Image`` per page.
    """
    fname = get_pdf_path(doc.slug)
    try:
        pdf = pdfplumber.open(fname)
    except Exception:
        # If the file's not there, that's fine -- we use available PDFs to
        # define what to see
        print(f"Cannot open pdf {fname}")
        return

    print(f"Rendering output for {fname}")

    # Get the correct answers: the rows of the token(s) labelled 1. These do
    # not depend on the page, so look them up once.
    target_rows = [
        doc.tokens.iloc[idx] for idx, val in enumerate(doc.labels) if val == 1
    ]

    # Score of each token relative to `score`; also page-independent.
    rel_score = scores / score
    confident = rel_score > 0.5

    page_images = []
    # Close the PDF once all pages are rendered so the file handle is not
    # leaked; each image is handed to wandb inside the block.
    with pdf:
        num_pages = len(pdf.pages)
        for pagenum, page in enumerate(pdf.pages):
            im = page.to_image(resolution=300)

            # training data has 0..1 for page range
            # (see create-training-data.py)
            if num_pages > 1:
                current_page = pagenum / float(num_pages - 1)
            else:
                current_page = 0.0

            # Draw guesses: heavier / magenta strokes for stronger guesses.
            page_match = np.isclose(doc.tokens["page"], current_page)
            for token in doc.tokens[page_match & confident].itertuples():
                token_score = rel_score[token.Index]
                if token_score == 1:
                    w = 5
                    s = "magenta"
                elif token_score >= 0.75:
                    w = 3
                    s = "red"
                else:
                    w = 1
                    s = "red"
                im.draw_rect(
                    docrow_to_bbox(token), stroke=s, stroke_width=w, fill=None
                )

            # Draw target tokens that fall on this page.
            rects = [
                docrow_to_bbox(row)
                for row in target_rows
                if np.isclose(row["page"], current_page)
            ]
            im.draw_rects(rects, stroke="blue", stroke_width=3, fill=None)

            page_images.append(
                wandb.Image(im.annotated, caption="page " + str(pagenum))
            )

    # get best matching score of any token in the training data
    match = doc.tokens[SINGLE_CLASS_PREDICTION].max()
    caption = (
        f"{doc.slug} guessed:{predict_text} answer:{answer_text} match:{match:.2f}"
    )
    verdict = dollar_match(predict_text, answer_text)
    return verdict, caption, page_images
Пример #2
0
def test_docrow_to_bbox(x0, y0, x1, y1, mh):
    """Check that ``docrow_to_bbox`` keeps three edges and enforces a height.

    The same source box is converted three ways: with ``min_height=None``,
    with the default, and with ``min_height=mh``. In every case ``x0``,
    ``x1`` and ``y1`` must come back unchanged (as ``Decimal``). With
    ``min_height=None`` the ``y0`` edge is unchanged too; otherwise the
    resulting height must be at least the requested minimum, allowing for
    floating-point rounding.

    NOTE(review): the arguments are presumably supplied by a parametrizing
    decorator that is not visible here, and the literal 10 is presumably the
    default ``min_height`` of ``docrow_to_bbox`` -- confirm both.
    """
    source = BoundingBox(x0=x0, x1=x1, y0=y0, y1=y1)
    no_minimum = docrow_to_bbox(source, min_height=None)
    default_minimum = docrow_to_bbox(source)
    custom_minimum = docrow_to_bbox(source, min_height=mh)

    # x0, x1 and y1 are never moved, whatever the minimum height is.
    for result in (no_minimum, default_minimum, custom_minimum):
        for edge, raw in (("x0", x0), ("x1", x1), ("y1", y1)):
            assert getattr(result, edge) == Decimal(raw)

    # Without a minimum height, y0 is passed through as well.
    assert no_minimum.y0 == Decimal(y0)

    # Floating point arithmetic, yo: accept heights that are only "close to"
    # the minimum rather than strictly at or above it.
    default_height = default_minimum.y1 - default_minimum.y0
    assert default_height >= 10 or isclose(default_height, 10)

    custom_height = custom_minimum.y1 - custom_minimum.y0
    assert custom_height >= mh or isclose(custom_height, mh)
Пример #3
0
def render_tokenized_pdf(doc):
    """Render a document's PDF pages with token boxes and adjacency lines.

    Every token whose page index exists in the PDF gets a blue rectangle.
    For every non-zero, off-diagonal entry of ``doc.adjacency_matrix`` a
    green line is drawn from the first token's ``(x0, y1)`` to the second
    token's ``(x0, y1)``, on the page of the first token.

    Args:
        doc: Document exposing ``slug``, a ``tokens`` table with ``page``,
            ``x0`` and ``y1`` columns, and an ``adjacency_matrix`` whose
            non-zero entries are ``(row, column)`` token positions.
            NOTE(review): assumes ``page`` is a zero-based page index here
            -- confirm against the code that builds ``doc.tokens``.

    Returns:
        ``None`` if the PDF cannot be opened, otherwise a list with one
        annotated page image per PDF page.
    """
    fname = get_pdf_path(doc.slug)
    try:
        # NOTE(review): the PDF is never closed. The returned page images
        # outlive this function and may still reference their pages, so
        # confirm that before adding a close() here.
        pdf = pdfplumber.open(fname)
    except Exception:
        # If the file's not there, that's fine -- we use available PDFs to
        # define what to see
        print(f"Cannot open pdf {fname}")
        return

    page_images = [
        {"image": page.to_image(resolution=300), "rects": [], "lines": []}
        for page in pdf.pages
    ]
    num_pages = len(page_images)

    # Collect one rectangle per token, skipping tokens beyond the last page.
    for token in doc.tokens.itertuples():
        page_num = int(token.page)
        if page_num < num_pages:
            page_images[page_num]["rects"].append(docrow_to_bbox(token))

    # Collect one line per pair of distinct, adjacent tokens.
    for first_index, second_index in np.argwhere(doc.adjacency_matrix):
        if first_index == second_index:
            continue
        first_token = doc.tokens.iloc[first_index]
        second_token = doc.tokens.iloc[second_index]
        # Use the page of this edge's first token. Previously the index left
        # over from the token loop above was used, which put every line on
        # the last token's page.
        edge_page = int(first_token.page)
        if edge_page >= num_pages:
            continue
        line = (
            (Decimal(float(first_token.x0)), Decimal(float(first_token.y1))),
            (Decimal(float(second_token.x0)), Decimal(float(second_token.y1))),
        )
        page_images[edge_page]["lines"].append(line)

    for page in page_images:
        image, rects, lines = page["image"], page["rects"], page["lines"]
        image.draw_rects(rects, stroke="blue", stroke_width=3, fill=None)
        print(f"first lines = {lines[:5]}")
        image.draw_lines(lines, stroke="green", stroke_width=3)

    return [page["image"] for page in page_images]