Exemplo n.º 1
0
    def pdf_to_txt(
        self,
        filename: Union[str, os.PathLike],
        output_folder: Union[str, os.PathLike],
        first_page: int = 1,
        last_page: Optional[int] = None,
    ) -> None:
        """Write every selected page of a PDF to its own text file.

        One ``self.p.extract_text`` call is made per page, with
        ``start_page`` and ``end_page`` both set to that page, so each page
        ends up in ``<output_folder>/<pdf stem>_<page, 4 digits>.txt``.

        Args:
            filename: Path of the PDF to extract.
            output_folder: Directory that receives the per-page text files.
            first_page: First page number to extract.
            last_page: Last page number to extract (inclusive). ``None`` or a
                value above the page count reported by pdfinfo means "up to
                the last page".
        """
        stem = Path(filename).stem
        # TODO: drop the dependency on the pdfinfo page count.
        total_pages = pdf2image.pdfinfo_from_path(filename)['Pages']
        if last_page is None or last_page > total_pages:
            last_page = int(total_pages)

        # TODO: iterate over something like p.get_pages(filename) instead
        # (sorted list of strings, or strings + titles, or markup).
        for page_number in range(first_page, last_page + 1):
            destination = Path(output_folder) / f'{stem}_{page_number:04}.txt'
            extraction_options = {
                'output_path': destination,
                'encoding': self.encoding,
                'html': self.html,
                'sort': self.sort,
                'ignore_beads': self.ignore_beads,
                'start_page': page_number,
                'end_page': page_number,
                'console': self.console,
            }
            self.p.extract_text(filename, **extraction_options)
        logger.success(f'Extracted: {stem}, pages: {total_pages}')
 def pdf_to_txt(
     self,
     filename: Union[str, os.PathLike],
     output_folder: Union[str, os.PathLike],
     first_page: int = 1,
     last_page: Optional[int] = None,
 ) -> None:
     """Extract the selected pages of a PDF into one text file per page.

     Each page is run through a ``PDFPageInterpreter`` whose
     ``TextConverter`` writes into a shared in-memory buffer. After a page is
     processed the buffer is written to
     ``<output_folder>/<pdf stem>_<page, 4 digits>.txt`` and then emptied.

     Args:
         filename: Path of the PDF to read.
         output_folder: Directory that receives the per-page text files.
         first_page: First page number (1-based) to extract.
         last_page: Last page number to extract (inclusive). ``None`` or a
             value above the page count reported by pdfinfo means "up to the
             last page".
     """
     stem = Path(filename).stem
     total_pages = pdf2image.pdfinfo_from_path(filename)['Pages']
     if last_page is None or last_page > total_pages:
         last_page = int(total_pages)
     text_buffer = StringIO()
     with open(filename, 'rb') as pdf_stream:
         document = PDFDocument(PDFParser(pdf_stream))
         resources = PDFResourceManager()
         converter = TextConverter(resources, text_buffer, laparams=LAParams())
         page_runner = PDFPageInterpreter(resources, converter)
         for index, pdf_page in enumerate(PDFPage.create_pages(document)):
             # enumerate() is 0-based while the page arguments are 1-based.
             if index in range(first_page - 1, last_page):
                 page_runner.process_page(pdf_page)
                 destination = Path(output_folder) / f'{stem}_{index + 1:04}.txt'
                 with open(destination, 'w') as page_file:
                     page_file.write(text_buffer.getvalue())
                 # Empty the shared buffer so the next page starts clean.
                 text_buffer.truncate(0)
                 text_buffer.seek(0)
     logger.success(f'Extracted: {stem}, pages: {total_pages}')
Exemplo n.º 3
0
    def process_pdf(self, filename, save_file_path):
        """Render a PDF to images and run ``self.process_img`` on each page.

        The PDF is converted in batches of up to 10 pages at 200 dpi. The
        first two pages and the last page are skipped. Every other page is
        saved as ``tmp10.jpg``, passed to ``self.process_img`` and followed
        by a flush of ``self.outfile``.

        Args:
            filename: Path of the PDF to process.
            save_file_path: Path of the output file, opened for writing and
                exposed as ``self.outfile`` while the pages are processed.
        """
        self.serial_no = 0
        self.outfile = open(save_file_path, "w")
        try:
            info = pdfinfo_from_path(filename, userpw=None, poppler_path=None)
            total_pages = info["Pages"]
            page_no = 0
            while page_no < total_pages:
                pages = convert_from_path(
                    filename,
                    dpi=200,
                    first_page=page_no + 1,
                    last_page=min(page_no + 10, total_pages),
                )
                if not pages:
                    # page_no only advances per rendered page, so an empty
                    # batch would otherwise loop forever.
                    break
                for page in pages:
                    page_no += 1
                    if page_no <= 2 or page_no == total_pages:
                        continue
                    savepath = 'tmp10.jpg'
                    page.save(savepath, 'JPEG')
                    print("[process_pdf] Processing {}th page".format(page_no))
                    self.process_img(savepath)
                    self.outfile.flush()
        finally:
            # Close the output file even when rendering or OCR fails.
            self.outfile.close()
        print(self.get_current_time())
Exemplo n.º 4
0
def find_pdfs(pdfs_dir, recursive=True, exclude_basenames=None):
    """Collect the single-page invoice PDFs found under ``pdfs_dir``.

    Args:
        pdfs_dir: Directory to search, as a string; a trailing ``/`` is added
            when missing.
        recursive: Sub-directories are searched only when this is exactly
            ``True``.
        exclude_basenames: Optional container of file basenames to leave out
            of the result.

    Returns:
        The matching PDF paths, in the order glob produced them.

    Raises:
        MinvoiceException: If no ``*.pdf`` file is found at all, or if a
            non-excluded PDF does not have exactly one page.
    """
    root = pdfs_dir if pdfs_dir.endswith('/') else pdfs_dir + '/'
    # Original note: on Windows the glob match is case-insensitive.
    deep = recursive is True
    candidates = glob(root + ('**/*.pdf' if deep else '*.pdf'), recursive=deep)
    if not candidates:
        raise MinvoiceException('No invoice PDF file found!')

    accepted = []
    for candidate in candidates:
        excluded = exclude_basenames is not None and \
            basename(candidate) in exclude_basenames
        if not excluded:
            if pdfinfo_from_path(candidate)["Pages"] != 1:
                raise MinvoiceException(
                    """Fatal error in processing PDF file {}: the invoice should
be one page per file""".format(candidate))
            accepted.append(candidate)
    return accepted
Exemplo n.º 5
0
def pdf_stats() -> Dict[str, int]:
    """Summarise the page counts of the PDFs directly inside ``CONFIG.pdf_dir``.

    Returns:
        A dict with the number of files (``files``), the total page count
        (``pages``) and the rounded ``mean`` and ``median`` pages per file.
        All four values are 0 when the directory holds no ``*.pdf`` file.
    """
    page_counts = [
        pdf2image.pdfinfo_from_path(pdf_file)['Pages']
        for pdf_file in Path(CONFIG.pdf_dir).glob('*.pdf')
    ]
    if not page_counts:
        # Without this guard the mean divides by zero and median() raises.
        return {'files': 0, 'pages': 0, 'mean': 0, 'median': 0}
    total_pages = sum(page_counts)
    return {
        'files': len(page_counts),
        'pages': total_pages,
        'mean': round(total_pages / len(page_counts)),
        'median': round(median(page_counts)),
    }
Exemplo n.º 6
0
    def ingest_pdf(self):
        """Render ``self.pdf`` to JPEG pages, OCR them and append the text to a file.

        The PDF is converted in batches of 10 pages (200 dpi, JPEG, written
        to a temporary directory) to avoid large memory usage. Each batch is
        passed to ``self.get_text`` through a multiprocessing pool, called
        with ``(page_image, page_number)``. The collected texts are appended
        to ``out_text2.txt`` in page order.
        """
        batch_size = 10
        # First, convert PDF to images.
        with tempfile.TemporaryDirectory() as path:
            # The page count is taken from self.pdf, so the same document is
            # used for rendering and reporting (the previous code referred to
            # a `filename` name that this method never defines).
            page_count = pdfinfo_from_path(self.pdf)["Pages"]
            print(page_count)
            print('Ingesting ' + str(self.pdf))
            start = time.time()
            page = 1
            all_texts = []
            # page_count + 1 keeps the final page when page_count % 10 == 1
            # (including single-page PDFs).
            for page_num in range(1, page_count + 1, batch_size):
                batch_end = min(page_num + batch_size - 1, page_count)
                # Convert the pdf pages to jpeg images.
                block = convert_from_path(self.pdf,
                                          dpi=200,
                                          first_page=page_num,
                                          last_page=batch_end,
                                          output_folder=path,
                                          thread_count=4,
                                          fmt='jpeg')
                print('Converting Pages ' + str(page_num) + "-" +
                      str(batch_end))
                batch_page_numbers = []
                batch_pages = []
                for file in block:
                    print('Ingesting page ' + str(page) + ' of ' +
                          str(page_count))
                    batch_pages.append(file)
                    batch_page_numbers.append(page)
                    page += 1
                # Extract the text from the jpeg images.
                with mp.Pool(mp.cpu_count()) as pool:
                    texts = pool.starmap(self.get_text,
                                         zip(batch_pages, batch_page_numbers))
                    all_texts.extend(texts)

            end = time.time()
            print('Finished ingesting pdf in ' + str(end - start) +
                  ' seconds.')

            # Append mode: the text of all pages accumulates in one file.
            # The context manager closes it even if a write fails.
            with open("out_text2.txt", "a") as f:
                for text in all_texts:
                    f.write(text)
Exemplo n.º 7
0
def _extract_text(pdf_path):
    """OCR a PDF page by page and return its relevant, non-empty text lines.

    Each page is rendered at 500 dpi and read with pytesseract. Processing
    stops after the first page that contains a ``"Votes:"`` line.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A list of stripped, non-empty text lines from the processed pages,
        with the per-page header removed from page 2 onward.

    Raises:
        ValueError: If a non-blank page after the first has no
            ``"Page <Number in words>"`` header line.
    """
    print("    [*] Extracting text from document")

    # This list will hold all the relevant word tokens that are extracted
    relevant_text_list = []

    for page_num in range(0, pdfinfo_from_path(pdf_path)["Pages"]):
        print("    [*] Converting PDF page to image")
        # Render outside the try block: if rendering fails there is no image
        # to close, and the cleanup must not touch an unbound name or the
        # previous page's image.
        page_image = convert_from_path(pdf_path,
                                       dpi=500,
                                       first_page=page_num + 1,
                                       last_page=page_num + 1)[0]
        try:
            # Recognize the text in the image using pytesseract, then split
            # it by newline, strip leading/trailing whitespace, and drop any
            # empty strings.
            raw_text = str(pytesseract.image_to_string(page_image))
            text = [
                line
                for line in (piece.strip() for piece in raw_text.split("\n"))
                if line
            ]

            # Pages 2 onward all start off with a four line header stating
            # the date, type of meeting, resolution number, and the page
            # number in the format "Page {Number}".
            #
            # These lines make extracting relevant information harder, so
            # they are trimmed off using the index of the "Page {Number}"
            # line (which is expected to always come last in the header).
            #
            # Sometimes a blank page will be encountered, so the text must be
            # checked to ensure this isn't the case.
            if (page_num + 1) > 1 and text != []:
                last_header_sentence_index = text.index(
                    "Page " + num2words(page_num + 1).capitalize())
                del text[0:last_header_sentence_index + 1]

            relevant_text_list += text

            # The page with the 'Votes:' section concludes the information
            # about the resolution itself; the rest of the pages contain
            # extra attachments. These don't need to be parsed, as the link
            # to them will be included on the resolution's page.
            if "Votes:" in text:
                break
        finally:
            page_image.close()

    return relevant_text_list
Exemplo n.º 8
0
 def output_images_to_directory(
         self,
         output_directory: str,
         dpi: int = 200,
         thread_count: int = multiprocessing.cpu_count(),
 ) -> None:
     """Render the PDF's pages to images and save them in a directory.

     Pages from ``self._skip_first`` up to the page count minus
     ``self._skip_last`` are rendered with pdf2image. Each image is saved
     under the name ``self.get_image_filename`` returns for its 0-based
     position in the rendered list.

     Args:
         output_directory: Directory the images are saved into.
         dpi: Rendering resolution passed to pdf2image.
         thread_count: Number of threads passed to pdf2image.
     """
     info = pdf2image.pdfinfo_from_path(
         self._pdf_file_path, userpw=self._password)
     total_pages = info["Pages"]
     # NOTE(review): _skip_first is passed unchanged as the first page number,
     # while _skip_last is subtracted as a count - confirm this is intended.
     start_page = self._skip_first
     final_page = total_pages - self._skip_last
     rendered = pdf2image.convert_from_path(
         self._pdf_file_path,
         dpi=dpi,
         first_page=start_page,
         last_page=final_page,
         thread_count=thread_count,
         userpw=self._password,
     )
     for position, picture in enumerate(rendered):
         target = os.path.join(output_directory,
                               self.get_image_filename(position))
         picture.save(target)
Exemplo n.º 9
0
def _nb_pages_in_pdf(filename: str) -> int:
    """Return the ``'Pages'`` entry that ``pdfinfo_from_path`` reports for ``filename``."""
    pdf_info = pdfinfo_from_path(filename)
    return pdf_info['Pages']
# Pick the first image file (.png/.jpg/.jpeg) listed in the user's
# Desktop\pdfInsert folder; the [0] index raises IndexError if there is none.
# NOTE(review): `host` is defined outside this excerpt - presumably the
# Windows user name; confirm.
pastePath = r"C:\Users\{}\Desktop\pdfInsert\{}".format(host, [
    n for n in os.listdir(r"C:\Users\{}\Desktop\pdfInsert".format(host))
    if n.endswith(".png") or n.endswith(".jpg") or n.endswith(".jpeg")
][0])
# Same lookup, this time for the first .pdf file in that folder.
pdfPath = r"C:\Users\{}\Desktop\pdfInsert\{}".format(host, [
    n for n in os.listdir(r"C:\Users\{}\Desktop\pdfInsert".format(host))
    if n.endswith(".pdf")
][0])
# Both paths are resolved once, before any user input is read.

Mode = input(
    "Keep aspect ratio? y/n: "
)  # NOTE(review): presumably selects thumbnail() ('y') vs resize() ('n') later on - that code is not visible here
page = int(input("PDF page number (f.ex '1' for first page): "))
pageCount = int(pdfinfo_from_path(pdfPath)["Pages"])
pngList = convert_from_path(
    pdfPath, first_page=1,
    last_page=pageCount)  # renders every page of the PDF; one list entry per page
pngList[page - 1].save(r"C:\Users\{}\Desktop\pdfInsert\temporary.png".format(
    host))  # page - 1: the user's page number is 1-based, the list index is 0-based

while True:
    pasteLocation = input("\nX/Y coordinates (eg. '800,50'): ")
    pasteSize = input("Width/Height in pixels (eg. ' 500,600'): ")

    if pasteSize == "" and pasteLocation == "":  # if your settings dont change for one loop, it automatically saves the pdf file and exits the program
        pngList[page - 1] = png
        pngList[0].save(r"C:\Users\{}\Desktop\pdfInsert.pdf".format(host),
                        "PDF",
                        resolution=100.0,
Exemplo n.º 11
0
def ocr_with_path(root_path):
    """OCR one PDF, or every PDF directly inside a directory, into ``.txt`` files.

    Each PDF is split into 25-page PDFs, each of which is OCR'd through
    ``drive_ocr.split_and_ocr_on_drive`` in 5-page pieces. The resulting
    text segments are concatenated into ``<pdf_path>.txt`` and cleaned, and
    the ``<pdf stem>_small_originals`` working directory is removed.

    Args:
        root_path: Path of a PDF file, or of a directory whose ``*.pdf``
            entries are processed. Any other path is reported and ignored.
    """
    # Create a list of pdf paths
    if os.path.isdir(root_path):
        pdf_list = [
            os.path.join(root_path, item) for item in os.listdir(root_path)
            if item.endswith(".pdf")
        ]
    elif os.path.isfile(root_path) and root_path.endswith("pdf"):
        pdf_list = [root_path]
    else:
        # Previously pdf_list stayed unbound here and the loop below raised
        # UnboundLocalError.
        print("No PDF found at {}; nothing to OCR.".format(root_path))
        pdf_list = []

    for pdf_path in pdf_list:
        # final OCR file name
        ocr_file_for_pdf_path = pdf_path + ".txt"

        # break the pdf into small PDFs of 25 pages each
        small_pdf_paths = break_to_small_pdf_paths_original(
            pdf_path,
            output_directory=None,
            start_page=1,
            end_page=None,
            small_pdf_pages=25,
        )
        # OCR text files produced for the 25-page PDFs
        small_pdf_final_ocr_path_list = []
        for each_small_pdf_path in small_pdf_paths:
            try:
                # Kept as a readability check: if pdfinfo cannot read the
                # segment, the exception is reported below and the segment
                # is skipped.
                pdfinfo_from_path(each_small_pdf_path,
                                  userpw=None,
                                  poppler_path=None)
                # OCR each 25-page PDF by compressing it and splitting it
                # into 5-page PDFs
                drive_ocr.split_and_ocr_on_drive(
                    pdf_path=each_small_pdf_path,
                    google_key="/path/to/key.json",
                    small_pdf_pages=5,
                    # change as desired for more/less compression of the PDF
                    pdf_compression_power=1,
                )
                small_pdf_final_ocr_path_list.append(
                    each_small_pdf_path + ".txt")
            except Exception as x:
                # Best effort: report the failure and go on with the next
                # segment.
                print(x)
                continue
        # Combine the OCR segments
        small_pdf_final_ocr_path_list.sort()
        file_helper.concatenate_files(
            input_path_list=small_pdf_final_ocr_path_list,
            output_path=ocr_file_for_pdf_path,
        )
        # clear useless characters
        file_helper.clear_bad_chars_in_file(file_path=ocr_file_for_pdf_path)
        # delete the working directory with the small PDFs and split OCR files
        output_directory = os.path.join(
            os.path.dirname(pdf_path),
            Path(pdf_path).stem + "_small_originals")
        if os.path.isdir(output_directory):
            shutil.rmtree(output_directory)
Exemplo n.º 12
0
from pdf2image import convert_from_path
import os

# Path of the pdf
path = "/home/sugar/Downloads/the-ultimate-husband/"
num = "1273-1284"
chapter = "chapter-" + num
PDF_file = "/home/sugar/Downloads/the-ultimate-husband-chapter-" + num + ".pdf"

os.mkdir(path+chapter)
'''
Part #1 : Converting PDF to images
'''

from pdf2image import pdfinfo_from_path,convert_from_path
info = pdfinfo_from_path(PDF_file, userpw=None, poppler_path=None)

# Counter to store images of each page of PDF to image
image_counter = 1

maxPages = info["Pages"]
for page in range(1, maxPages+1, 10) : 
    pages = convert_from_path(PDF_file, dpi=200, first_page=page, last_page = min(page+10-1,maxPages))

    # Iterate through all the pages stored above
    for page in pages:

        # Declaring filename for each page of PDF as JPG
        # For each page, filename will be:
        # PDF page 1 -> page_1.jpg
        # PDF page 2 -> page_2.jpg
Exemplo n.º 13
0
from pdf2image.exceptions import (PDFInfoNotInstalledError, PDFPageCountError,
                                  PDFSyntaxError)
from pdf2image import convert_from_path, convert_from_bytes, pdfinfo_from_path
from PIL import Image
import PIL
import tempfile
import os

directory = './'
# Render every PDF in the current directory, 10 pages at a time, next to a
# per-PDF "<name>_dir" output directory.
for filename in os.listdir(directory):
    if filename.endswith(".pdf"):
        info = pdfinfo_from_path('./' + str(filename),
                                 userpw=None,
                                 poppler_path=None)
        maxPages = info["Pages"]
        i = -1
        directory_dir = str(filename) + "_dir"
        parent_dir = "./"
        path = os.path.join(parent_dir, directory_dir)
        # exist_ok lets the script be re-run without failing on a directory
        # left over from a previous run.
        os.makedirs(path, exist_ok=True)

        # maxPages + 1 keeps the final page when maxPages % 10 == 1
        # (including single-page PDFs).
        for page in range(1, maxPages + 1, 10):
            images_from_path = convert_from_path('./' + str(filename),
                                                 fmt='jpeg',
                                                 dpi=200,
                                                 first_page=page,
                                                 last_page=min(
                                                     page + 10 - 1, maxPages))
            for image in images_from_path:
                i += 1
Exemplo n.º 14
0
def process_forms(paths, params):
    """Validate, optionally trim, classify and parse an uploaded form.

    For PDFs the password is validated first. If the PDF has more pages
    than the ``api.max_file_pages`` config value, it is replaced by a copy
    of its first 16 pages. The content type is then detected and the form
    goes to unstructured or structured parsing. ``flush_dir(paths)`` is
    called before every return.

    Args:
        paths: Mapping with at least ``"filepath"`` and ``"inputpath"``.
        params: Mapping with ``"spam_flag"``, ``"structuredParsing_flag"``,
            ``"unstructuredParsing_flag"``, ``"pass_header"`` and
            ``"doc_type"``.

    Returns:
        The password-validation response when it carries a code and a
        message, the non-list result of ``find_content_type``, the parsing
        response, or a dict with ``code`` 5 when structured parsing is
        disabled.
    """
    # SECURITY: the pass header and decoded password are deliberately kept
    # out of the logs; only their presence is recorded.
    loggable_params = {
        key: ("<redacted>" if key == "pass_header" else value)
        for key, value in params.items()
    }
    # The logger needs a %s placeholder for each extra argument; without one
    # the logging module reports a formatting error instead of the message.
    logger.info("list of all paths is %s", paths)
    logger.info("list of all params is %s", loggable_params)

    # setting flag value
    flags = flag_setter(params["spam_flag"], params["structuredParsing_flag"],
                        params["unstructuredParsing_flag"])
    spam_flag = flags[0]
    structuredParsing_flag = flags[1]
    unstructuredParsing_flag = flags[2]

    # checking password
    logger.info("Pass header present: %s", bool(params["pass_header"]))
    passwd = get_passwd(params["pass_header"])
    logger.info("Password decoded: %s", bool(passwd))

    if params["doc_type"] == "application/pdf":
        response = validate_password(paths["filepath"], passwd)
        logger.info(response)

        if response['code'] is not None and response['message'] is not None:
            flush_dir(paths)
            return response
        try:
            pdf_info = pdfinfo_from_path(paths["filepath"], userpw=passwd)
            decrypt_pdf(paths, pdf_info, passwd)

            if pdf_info["Pages"] > int(config.get('api', 'max_file_pages')):
                # Open the source once instead of once per copied page.
                # NOTE(review): assumes pfr/pfw are PDF reader/writer
                # classes whose reader can serve several getPage() calls.
                reader = pfr(paths["filepath"], 'rb')
                temp_var = pfw()
                # NOTE(review): 16 is hard-coded and not derived from
                # api.max_file_pages - confirm the two are meant to match.
                for i in range(16):
                    temp_var.addPage(reader.getPage(i))
                trimmed_path = paths["inputpath"] + "/trimmed_form.pdf"
                with open(trimmed_path, 'wb') as f:
                    temp_var.write(f)
                os.rename(trimmed_path, paths["filepath"])
        except Exception:
            # Trimming stays best-effort, but the failure is now recorded
            # instead of being silently discarded.
            logger.exception("Could not inspect or trim PDF; continuing "
                             "with the file as-is")

    result = find_content_type(paths, params["doc_type"])
    logger.info(result)
    if not isinstance(result, list):
        flush_dir(paths)
        return result

    content_type = result[0]
    module_name = result[1]
    text = result[2]

    # parsing
    if content_type in ["UNKNOWN", "", " "]:
        response = unstructured_form_parsing(paths, text[0],
                                             unstructuredParsing_flag,
                                             spam_flag)
        flush_dir(paths)
        return response

    # code for structured form parsing
    if not structuredParsing_flag:
        x = {
            "model_type": content_type,
            "code": 5,
            "message": "Form is medwatch or CIOMS.",
        }
        flush_dir(paths)
        return x

    response = structured_form_parsing(paths, content_type, module_name)
    flush_dir(paths)
    return response
Exemplo n.º 15
0
def ocr_engine(filename):
    """Run OCR on an image file, or on every page of a PDF, and return the text.

    For a name containing ``'.pdf'`` the pages are rendered at 100 dpi, 20
    at a time, into ``output/pdfout/out<page>.jpg``. Stale ``output/*.png``
    files are removed, then the rendered images are OCR'd in the order
    given by ``utils.rearrange`` and their results concatenated. Any other
    file goes straight to ``utils.Ocr_image``.

    Args:
        filename: Path of the PDF or image to read.

    Returns:
        The OCR result; ``None`` for a PDF that produced no page images.
    """
    line_dir = 'output/'
    pdf_dir = line_dir + 'pdfout/'

    # Directory creation is best-effort, but failures are now reported.
    try:
        os.makedirs(line_dir, exist_ok=True)
    except OSError as e:
        print("Error: %s : %s" % (line_dir, e.strerror))

    if '.pdf' not in filename:
        utils.truncate_data()
        return utils.Ocr_image(filename)

    try:
        os.makedirs(pdf_dir, exist_ok=True)
    except OSError as e:
        print("Error: %s : %s" % (pdf_dir, e.strerror))

    # Drop page images left over from a previous run.
    for stale in glob.glob(pdf_dir + '*'):
        os.remove(stale)

    try:
        # Imported here so that a missing pdf2image is reported below
        # rather than breaking the module import.
        from pdf2image import pdfinfo_from_path, convert_from_path
        info = pdfinfo_from_path(filename, userpw=None, poppler_path=None)

        maxPages = info["Pages"]
        for page in range(1, maxPages + 1, 20):
            pages = convert_from_path(filename,
                                      dpi=100,
                                      first_page=page,
                                      last_page=min(
                                          page + 20 - 1, maxPages))

            for i, page1 in enumerate(pages):
                print(i)
                # page + i is the absolute 1-based page number.
                page1.save(pdf_dir + 'out{0}.jpg'.format(page + i),
                           'JPEG')
    except Exception as e:
        # Rendering stays best-effort: pages already written are still
        # OCR'd below.
        print("Error: rendering %s failed : %s" % (filename, e))

    for f in glob.glob(line_dir + '*.png'):
        try:
            os.unlink(f)
        except OSError as e:
            print("Error: %s : %s" % (f, e.strerror))

    utils.truncate_data()

    arr = utils.rearrange(os.listdir(pdf_dir))

    text = None
    for f in arr:
        page_text = utils.Ocr_image('output/pdfout/' + f)
        if page_text is None:
            # NOTE(review): assumes Ocr_image may return None for a page
            # without text - confirm against utils.
            continue
        text = page_text if text is None else text + page_text
    return text