# Standard-library / third-party imports for the functions below; project-internal
# names (logger, LABEL_COLS, TOKEN_DIR, process_document_tokens, tokenize_pdf,
# Document, config, etc.) are assumed to be defined elsewhere in the repo.
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import pandas as pd
from joblib import dump, load
from tqdm import tqdm


def pdf_paths(*paths):
    """Yield paths to PDF files, expanding directories into the PDFs they contain."""
    for path in paths:
        path = Path(path)
        if path.is_file():
            if path.suffix != ".pdf":
                logger.warning(f"Skipping non-pdf '{path}'")
                continue
            yield path
        elif path.is_dir():
            for file_path in path.glob("*.pdf"):
                yield file_path
        else:
            logger.warning(f"'{path}' is not a file or directory")
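# Usage sketch with placeholder paths (an assumption, not part of the original
# module): a mix of a directory, a PDF, and a non-PDF. The non-PDF is skipped with
# a warning, every *.pdf inside the directory is yielded, and the generator can be
# consumed lazily.
def _example_pdf_paths():
    for pdf in pdf_paths("filings/", "one_filing.pdf", "notes.txt"):
        logger.info(f"Would tokenize {pdf}")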
def extend_and_write_docs(
    source_dir,
    manifest,
    pq_index,
    out_path,
    max_token_count,
    use_adjacency_matrix=False,
):
    """Split data into individual documents, add features, and write to parquet."""
    token_files = {p.stem: p for p in source_dir.glob("*.parquet")}

    jobqueue = []
    for row in manifest.itertuples():
        slug = row.file_id
        if slug not in token_files:
            logger.error(f"No token file for {slug}")
            continue
        labels = {}
        for label_col in LABEL_COLS:
            labels[label_col] = getattr(row, label_col)
            if not labels[label_col]:
                logger.warning(f"'{label_col}' for {slug} is empty")
        jobqueue.append(
            {
                "token_file": token_files[slug],
                "dest_file": out_path / f"{slug}.parquet",
                "graph_file": out_path / f"{slug}.graph",
                "labels": labels,
                "max_token_count": max_token_count,
                "use_adjacency_matrix": use_adjacency_matrix,
            }
        )

    # Spin up a bunch of jobs to do the conversion
    with ThreadPoolExecutor() as executor:
        doc_jobs = []
        for kwargs in jobqueue:
            doc_jobs.append(executor.submit(process_document_tokens, **kwargs))

        logger.debug("Waiting for jobs to complete")
        progress = tqdm(as_completed(doc_jobs), total=len(doc_jobs))
        doc_results = [j.result() for j in progress]

    logger.debug(f"Writing document index to {pq_index}...")
    doc_index = pd.DataFrame(doc_results).set_index("slug", drop=True)
    doc_index.to_parquet(pq_index)
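# Illustrative driver for extend_and_write_docs, assuming a CSV manifest with a
# "file_id" column plus the LABEL_COLS columns; the file names, directory layout,
# and default max_token_count here are placeholders, not the project's actual
# configuration.
def _build_training_set(manifest_csv, token_dir, out_dir, max_token_count=4096):
    manifest = pd.read_csv(manifest_csv)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    extend_and_write_docs(
        source_dir=Path(token_dir),
        manifest=manifest,
        pq_index=out_dir / "doc_index.parquet",
        out_path=out_dir,
        max_token_count=max_token_count,
    )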
def slug_to_doc(slug, labels):
    """Load a Document for the given slug, using the joblib cache when enabled."""
    # pq_root, cache_root, and config are expected to be available in the enclosing scope.
    pq_path = pq_root / f"{slug}.parquet"
    if config.use_data_cache:
        cache_path = cache_root / f"{slug}.joblib"
        try:
            with open(cache_path, "rb") as infile:
                return load(infile)
        except FileNotFoundError:
            logger.debug(f"Cache file {cache_path} not found")
    try:
        doc = Document.from_parquet(slug, labels, pq_path, config)
    except AssertionError:
        logger.warning(f"No correct answers for {slug}, skipping")
        return None
    if config.use_data_cache:
        with open(cache_path, "wb") as outfile:
            dump(doc, outfile)
        logger.debug(f"Wrote document to cache file {cache_path}")
    return doc
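# Sketch of mapping slug_to_doc over a manifest to build a document list, assuming
# the same manifest / LABEL_COLS shape as above (an illustrative helper, not part
# of the original code). Rows with no correct answers come back as None and are
# dropped.
def _load_documents(manifest):
    docs = []
    for row in manifest.itertuples():
        labels = {col: getattr(row, col) for col in LABEL_COLS}
        doc = slug_to_doc(row.file_id, labels)
        if doc is not None:
            docs.append(doc)
    return docs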
def create_token_doc(pdf_path, token_dir=TOKEN_DIR, overwrite=False):
    """Tokenize a single PDF and write its tokens to a parquet file in token_dir."""
    pdf_path, token_dir = Path(pdf_path), Path(token_dir)
    assert pdf_path.is_file() and pdf_path.suffix == ".pdf"
    slug = pdf_path.stem
    token_path = token_dir / f"{slug}.parquet"
    if token_path.is_file():
        if overwrite:
            logger.warning(f"Overwriting {token_path}")
        else:
            return
    try:
        tokens = tokenize_pdf(pdf_path)
    except EOFError:
        logger.warning(f"pdfplumber found no tokens in '{pdf_path}'")
        return
    except Exception as e:
        logger.error(f"Unable to tokenize {pdf_path}: {e}")
        return
    token_dir.mkdir(parents=True, exist_ok=True)
    tokens.to_parquet(token_path)
    return token_path
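# Sketch of batch tokenization combining pdf_paths and create_token_doc, reusing
# the ThreadPoolExecutor/tqdm pattern from extend_and_write_docs above; this
# helper is an assumption, not part of the original pipeline.
def _tokenize_all(*inputs, overwrite=False):
    """Tokenize every PDF found among the given files/directories, in parallel."""
    with ThreadPoolExecutor() as executor:
        jobs = [
            executor.submit(create_token_doc, pdf, overwrite=overwrite)
            for pdf in pdf_paths(*inputs)
        ]
        results = [j.result() for j in tqdm(as_completed(jobs), total=len(jobs))]
    # create_token_doc returns None for skipped or failed PDFs
    return [path for path in results if path is not None]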