예제 #1
0
def inspect(file: Union[str, bytes, Path]) -> PdfRedactionsDict:
    """
    Inspect a file for bad redactions and return a Dict with their info

    :file: The PDF to process: bytes if you have the file in memory (useful
    if it's coming from the network), a string starting with "https://" to
    have the PDF downloaded first, a string with the path to the file on
    your local disk, or a pathlib.Path object.
    :return: A dict with the bad redaction information. The dict built here
    maps 1-based page numbers to that page's bad redactions and is passed
    through check_if_all_dates before being returned. If no bad redactions
    are found, returns an empty dict.
    :raises requests.RequestException: If an "https://" input cannot be
    downloaded (connection error, timeout or an HTTP error status).
    """
    if isinstance(file, bytes):
        pdf = Document(stream=file, filetype="pdf")
    elif isinstance(file, str) and file.startswith("https://"):
        # Remote file: fetch it and open the downloaded bytes in memory.
        r = requests.get(file, timeout=10)
        r.raise_for_status()
        pdf = Document(stream=r.content, filetype="pdf")
    else:
        # str filepath or Pathlib Path
        pdf = Document(file)

    bad_redactions = {}
    try:
        for page_number, page in enumerate(pdf, start=1):
            redactions = get_bad_redactions(page)
            if redactions:
                bad_redactions[page_number] = redactions
    finally:
        # Release the document even if a page fails to be analysed.
        pdf.close()

    return check_if_all_dates(bad_redactions)
예제 #2
0
def _font_size_pt(style):
    """Return the whole-number part of the ``font-size`` value in a CSS style.

    ``style`` is the text of a ``style`` attribute, e.g.
    ``"font-family:Times;font-size:10.5pt"`` gives ``10``.

    Raises ValueError when ``style`` has no ``font-size:`` declaration or its
    value does not start with a digit.
    """
    _, _, value = style.partition("font-size:")
    value = value.lstrip()
    digit_count = 0
    while digit_count < len(value) and value[digit_count].isdigit():
        digit_count += 1
    return int(value[:digit_count])


# For every name, collect the pages (offset by page_delta) on which its
# title-cased form appears, write the merged result as one line of the
# results file, then post-process that file.
try:
    for name in names_list:
        print(f"Parsing: {name}")
        title = name.title()
        word_in_pages = []
        results.write(f"{name} ")
        for page in range(start, end):
            lines = unescape(doc[page].getText("html")).split("\n")
            for line in lines:
                bs = BeautifulSoup(line.replace("\t", " "), "html.parser")
                paragraph = bs.p
                if paragraph is None:
                    continue
                if title not in paragraph.text:
                    continue
                # Paragraphs positioned at top:84pt are ignored
                # (presumably the running page header - TODO confirm).
                if "top:84pt" in str(paragraph["style"]):
                    continue

                page_entry = str(page + page_delta)
                # Text smaller than font_standard gets an "n" suffix
                # (presumably marking a note/footnote hit - TODO confirm).
                if _font_size_pt(str(paragraph.span["style"])) < font_standard:
                    page_entry += "n"
                if page_entry in word_in_pages:
                    continue
                print(f"\t'{name}' found in page {page}")
                word_in_pages.append(page_entry)
        merged_pages = merge_results(word_in_pages)
        write_to_file(merged_pages, results)
        results.write("\n")
finally:
    # Always release the output file and the document, even on failure.
    results.close()
    doc.close()
results_cleaner(results_file)