Example #1
# Assumed setup for this snippet: Path comes from pathlib and logger is a
# module-level logging.Logger.
import logging
from pathlib import Path

logger = logging.getLogger(__name__)


def pdf_paths(*paths):
    """Yield PDF file paths from any mix of file and directory arguments."""
    for path in paths:
        path = Path(path)
        if path.is_file():
            if path.suffix != ".pdf":
                logger.warning(f"Skipping non-pdf '{path}'")
                continue
            yield path
        elif path.is_dir():
            # Only top-level *.pdf files are collected; glob() is not recursive here.
            for file_path in path.glob("*.pdf"):
                yield file_path
        else:
            logger.warning(f"'{path}' is not a file or directory")
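
A minimal usage sketch for the generator above; the sample arguments ("report.pdf" and "scans/") are placeholders, not paths from the source.

# Hypothetical call mixing a single file and a directory of PDFs.
for pdf in pdf_paths("report.pdf", "scans/"):
    print(pdf.name)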
Example #2
# Assumed setup for this snippet: ThreadPoolExecutor and as_completed come from
# concurrent.futures, pd is pandas, tqdm is the tqdm progress bar, and
# LABEL_COLS, process_document_tokens, and logger are defined in the
# surrounding module.
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from tqdm import tqdm


def extend_and_write_docs(
    source_dir,
    manifest,
    pq_index,
    out_path,
    max_token_count,
    use_adjacency_matrix=False,
):
    """Split data into individual documents, add features, and write to parquet."""

    token_files = {p.stem: p for p in source_dir.glob("*.parquet")}

    jobqueue = []
    for row in manifest.itertuples():
        slug = row.file_id
        if slug not in token_files:
            logger.error(f"No token file for {slug}")
            continue
        labels = {}
        for label_col in LABEL_COLS:
            labels[label_col] = getattr(row, label_col)
            if not labels[label_col]:
                logger.warning(f"'{label_col}' for {slug} is empty")
        jobqueue.append(
            {
                "token_file": token_files[slug],
                "dest_file": out_path / f"{slug}.parquet",
                "graph_file": out_path / f"{slug}.graph",
                "labels": labels,
                "max_token_count": max_token_count,
                "use_adjacency_matrix": use_adjacency_matrix,
            }
        )

    # Spin up a bunch of jobs to do the conversion
    with ThreadPoolExecutor() as executor:
        doc_jobs = []
        for kwargs in jobqueue:
            doc_jobs.append(executor.submit(process_document_tokens, **kwargs))

        logger.debug("Waiting for jobs to complete")
        progress = tqdm(as_completed(doc_jobs), total=len(doc_jobs))
        doc_results = [j.result() for j in progress]

    logger.debug(f"Writing document index to {pq_index}...")
    doc_index = pd.DataFrame(doc_results).set_index("slug", drop=True)
    doc_index.to_parquet(pq_index)
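
A hedged usage sketch for extend_and_write_docs; the file and directory names, the max_token_count value, and the shape of the manifest are assumptions inferred from how the function reads its arguments, not details from the source.

from pathlib import Path

# Hypothetical call: the manifest is assumed to be a DataFrame with a file_id
# column plus the columns named in LABEL_COLS.
manifest = pd.read_csv("manifest.csv")
extend_and_write_docs(
    source_dir=Path("tokens"),
    manifest=manifest,
    pq_index=Path("doc_index.parquet"),
    out_path=Path("docs"),
    max_token_count=4000,
)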
Example #3
# Assumed setup for this snippet: load/dump are taken to be joblib's (the cache
# files use a .joblib suffix), and pq_root, cache_root, config, Document, and
# logger are defined in the surrounding module.
from joblib import dump, load


def slug_to_doc(slug, labels):
    """Build a Document for a slug, reading from and writing to the cache when enabled."""
    pq_path = pq_root / f"{slug}.parquet"
    if config.use_data_cache:
        cache_path = cache_root / f"{slug}.joblib"
        try:
            with open(cache_path, "rb") as infile:
                return load(infile)
        except FileNotFoundError:
            logger.debug(f"Cache file {cache_path} not found")
    try:
        doc = Document.from_parquet(slug, labels, pq_path, config)
    except AssertionError:
        logger.warning(f"No correct answers for {slug}, skipping")
        return None
    if config.use_data_cache:
        with open(cache_path, "wb") as outfile:
            dump(doc, outfile)
        logger.debug(f"Wrote document to cache file {cache_path}")
    return doc
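
A short usage sketch for slug_to_doc; the slug and label dictionary are placeholders, and a None return means the document had no correct answers and was skipped.

# Hypothetical call with placeholder slug and labels.
doc = slug_to_doc("contract-0001", {"name": "ACME Corp"})
if doc is None:
    print("Document skipped")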
Example #4
# Assumed setup for this snippet: Path comes from pathlib, and TOKEN_DIR,
# tokenize_pdf, and logger are defined in the surrounding module.
def create_token_doc(pdf_path, token_dir=TOKEN_DIR, overwrite=False):
    """Tokenize one PDF and write the tokens to <token_dir>/<slug>.parquet."""
    pdf_path, token_dir = Path(pdf_path), Path(token_dir)
    assert pdf_path.is_file() and pdf_path.suffix == ".pdf"

    slug = pdf_path.stem
    token_path = token_dir / f"{slug}.parquet"
    if token_path.is_file():
        if overwrite:
            logger.warning(f"Overwriting {token_path}")
        else:
            return

    try:
        tokens = tokenize_pdf(pdf_path)
    except EOFError:
        logger.warning(f"pdfplumber found no tokens in '{pdf_path}'")
        return
    except Exception as e:
        logger.error(f"Unable to tokenize {pdf_path}: {e}")
        return

    token_dir.mkdir(parents=True, exist_ok=True)
    tokens.to_parquet(token_path)
    return token_path
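
A usage sketch combining this function with pdf_paths from Example #1; the pdfs/ directory is a placeholder.

# Hypothetical batch run: tokenize every PDF under pdfs/, skipping files whose
# parquet output already exists.
for pdf in pdf_paths("pdfs/"):
    token_path = create_token_doc(pdf)
    if token_path:
        print(f"Wrote {token_path}")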