def inspect(file: Union[str, bytes, Path]) -> PdfRedactionsDict:
    """Inspect a file for bad redactions and return a dict with their info.

    :param file: The PDF to process: ``bytes`` if you have the file in memory
        (useful if it's coming from the network), a ``str`` path on disk or an
        ``https://`` URL, or a ``pathlib.Path`` object.
    :return: A dict mapping 1-based page number to that page's bad redaction
        info. If no bad redactions are found, returns an empty dict.
    """
    # isinstance instead of type(...) == ... so str/bytes subclasses work too.
    if isinstance(file, bytes):
        pdf = Document(stream=file, filetype="pdf")
    elif isinstance(file, str) and file.startswith("https://"):
        r = requests.get(file, timeout=10)
        r.raise_for_status()
        pdf = Document(stream=r.content, filetype="pdf")
    else:
        # str filepath or pathlib.Path
        pdf = Document(file)

    bad_redactions = {}
    try:
        for page_number, page in enumerate(pdf, start=1):
            redactions = get_bad_redactions(page)
            if redactions:
                bad_redactions[page_number] = redactions
    finally:
        # Always release the document, even if redaction analysis raises.
        pdf.close()
    return check_if_all_dates(bad_redactions)
# Search each name across the configured page range of the open PyMuPDF
# document and write the page numbers where it appears to the results file.
# Names used here (names_list, results, doc, start, end, page_delta,
# font_standard, merge_results, write_to_file, results_cleaner, results_file)
# are defined elsewhere in this file.
for name in names_list:
    print(f"Parsing: {name}")
    word_in_pages = []
    results.write(f"{name} ")
    for page in range(start, end):
        # Render the page as HTML, unescape entities, and inspect it
        # line by line.
        lines = unescape(doc[page].getText("html")).split("\n")
        for line in lines:
            item = line.replace("\t", " ")
            bs = BeautifulSoup(item, "html.parser")
            if bs.p is not None:
                # Match the title-cased name; skip paragraphs styled at
                # top:84pt (presumably a running header — TODO confirm).
                if name.title() in str(bs.p.text) and str(
                        bs.p["style"]).find("top:84pt") == -1:
                    # Extract the numeric font size from the span's inline
                    # style, e.g. "font-size:9.5pt" -> "9".
                    # NOTE(review): lstrip("font-size:") strips a *set* of
                    # characters, not a prefix string — it works here only
                    # because digits are not in that set; verify.
                    font_index = str(bs.p.span["style"]).find("font-size")
                    font_size = str(bs.p.span["style"][font_index:]).lstrip(
                        "font-size:")[:2].rstrip("p").rstrip(".")
                    # page_delta maps the 0-based PDF index to the printed
                    # page number.
                    string_to_write = str(page + page_delta)
                    if int(font_size) < font_standard:
                        # Smaller-than-standard font: tag the hit with "n"
                        # (presumably marks a footnote — TODO confirm).
                        string_to_write += "n"
                    if string_to_write in word_in_pages:
                        # Already recorded this page for this name.
                        continue
                    else:
                        print("\t'{}' found in page {}".format(name, page))
                        word_in_pages.append(string_to_write)
                        string_to_write = ""
    # Collapse the collected page numbers (merge_results) and append them to
    # the results file, one line per name.
    merged_pages = merge_results(word_in_pages)
    write_to_file(merged_pages, results)
    results.write("\n")
results.close()
doc.close()
results_cleaner(results_file)