Example #1
    def open(index_file, config):
        """Load the documents referenced by `index_file` and apply `config`."""
        index_file = Path(index_file)
        doc_index = pd.read_parquet(index_file)
        logger.info(f"{len(doc_index)} documents in index")

        if not config.pad_windows:
            # Filter out documents that are too short for the current config.
            doc_index = doc_index[doc_index["length"] >= config.window_len]

        # Filter out documents that don't have a sufficiently high match.
        # doc_index = doc_index[doc_index["best_match"] >= config.target_thresh]
        logger.info(
            f"After applying the config, {len(doc_index)} documents are available")

        # Sample down to no more than the requested number of documents.
        num_docs = min(config.len_train, len(doc_index))
        doc_index = doc_index.sample(n=num_docs)

        # Load each of the documents, finishing any necessary feature computation.
        slug_to_doc = caching_doc_getter(index_file, config)
        # docs = concurrent.thread_map(slug_to_doc, doc_index["slug"])

        labels = doc_index[LABEL_COLS.keys()]
        docs = np.array(
            [slug_to_doc(slug, labels.loc[slug]) for slug in doc_index.index])
        docs = docs[docs != None]  # noqa: E711

        return DocumentStore(docs)
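The function above only reads pad_windows, window_len, and len_train from the config (target_thresh appears only in the commented-out filter), and its indentation suggests it may be a method of a class not shown here. A minimal calling sketch, assuming it is importable as a plain function; the SimpleNamespace config and the path are purely illustrative:

from types import SimpleNamespace

# Illustrative config: only the attributes read by `open` above are set.
config = SimpleNamespace(
    pad_windows=False,   # short documents are dropped rather than padded
    window_len=512,      # minimum document length when not padding
    len_train=10_000,    # upper bound on the number of sampled documents
)

store = open("doc_index.parquet", config)  # the `open` defined above, not the builtin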
Example #2
    )
    parser.add_argument(
        "indexfile",
        nargs="?",
        default=TRAINING_INDEX,
        help="path to index of resulting parquet files",
    )
    parser.add_argument(
        "outdir",
        nargs="?",
        default=TRAINING_DIR,
        help="directory of parquet files",
    )
    parser.add_argument(
        "--max-token-count",
        type=int,
        default=5,
        help="maximum number of contiguous tokens to match against each label",
    )
    parser.add_argument("--log-level", dest="log_level", default="INFO")
    args = parser.parse_args()
    logger.setLevel(args.log_level.upper())

    logger.info(f"Reading {Path(args.manifest).resolve()}")
    manifest = pd.read_csv(args.manifest)

    indir, index, outdir = Path(args.indir), Path(args.indexfile), Path(args.outdir)
    index.parent.mkdir(parents=True, exist_ok=True)
    outdir.mkdir(parents=True, exist_ok=True)
    extend_and_write_docs(indir, manifest, index, outdir, args.max_token_count)
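The fragment above starts partway through building the parser: the closing parenthesis on its first line belongs to an argument definition that was cut off. The later code reads args.manifest and args.indir, so the missing part presumably defines those two arguments. A hypothetical reconstruction; the description and help strings are illustrative only:

import argparse

parser = argparse.ArgumentParser(
    description="extend documents and write them out as parquet files"
)
parser.add_argument("manifest", help="path to the CSV manifest of source documents")
parser.add_argument(
    "indir",
    help="directory containing the input documents",
)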
Example #3
def log_wandb_pdfs(doc, doc_log, all_scores):
    fname = get_pdf_path(doc.slug)
    try:
        pdf = pdfplumber.open(fname)
    except Exception:
        # If the file isn't there, that's fine -- the set of available PDFs
        # defines what we visualize.
        logger.warning(f"Cannot open pdf {fname}")
        return

    logger.info(f"Rendering output for {fname}")

    # map class labels for visualizing W&B bounding boxes
    # TODO: use a type and separate out ground truth
    class_ids_by_field = {
        "gross_amount": 0,
        "flight_to": 1,
        "flight_from": 2,
        "contract_num": 3,
        "advertiser": 4,
        "ground_truth": 5,
    }
    class_id_to_label = {int(v): k for k, v in class_ids_by_field.items()}

    # visualize the first page of the document for which we have ground truth labels
    pagenum = int(doc.tokens[doc.labels > 0].page.min())
    page = pdf.pages[pagenum]
    im = page.to_image(resolution=300)

    # loop over all predictions
    pred_bboxes = []
    for i, score in enumerate(doc_log["score"]):
        rel_score = all_scores[:, i] / score
        page_match = doc.tokens.page == pagenum
        curr_field = doc_log["field"][i]

        # we could remove this threshold and rely entirely
        # on the wandb bbox dynamic threshold
        for token in doc.tokens[page_match & (rel_score > 0.5)].itertuples():
            pred_bboxes.append(
                wandb_bbox(
                    token,
                    score,
                    class_ids_by_field[curr_field],
                    im,
                )
            )
    # draw ground-truth tokens on the same page
    target_toks = doc.tokens[(doc.labels > 0) & (doc.tokens.page == pagenum)]
    true_bboxes = [
        wandb_bbox(t, 1, class_ids_by_field["ground_truth"], im)
        for t in target_toks.itertuples()
    ]

    boxes = {
        "predictions": {
            "box_data": pred_bboxes,
            "class_labels": class_id_to_label,
        },
        "ground_truth": {
            "box_data": true_bboxes,
            "class_labels": class_id_to_label,
        },
    }
    wandb.log({f"pdf/{fname.name}:{pagenum}": wandb.Image(im.annotated, boxes=boxes)})
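wandb_bbox is not shown in this example. A sketch of what such a helper might look like, assuming pdfplumber-style token coordinates in PDF points (x0, x1, top, bottom) and the standard W&B box_data dictionary; the 300 / 72 scale matches the page.to_image(resolution=300) call above and is an assumption about how the image was rendered:

def wandb_bbox(token, score, class_id, im, scale=300 / 72):
    """Convert one token row into a W&B bounding-box dict in pixel coordinates.

    Assumes `token` carries pdfplumber-style coordinates in PDF points
    (x0, x1, top, bottom). `im` is accepted to mirror the call sites above
    but is not needed for a pixel-domain box.
    """
    return {
        "position": {
            "minX": token.x0 * scale,
            "maxX": token.x1 * scale,
            "minY": token.top * scale,
            "maxY": token.bottom * scale,
        },
        "domain": "pixel",
        "class_id": class_id,
        "scores": {"score": float(score)},
    }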