def delivery_type(page: "pdf.PageObject") -> dict:
    """Classify a PDF page from its extracted text.

    The checks run in this order and the first match wins:

    1. The text contains ``"Tracking"``: the page is a ``"Label"`` and
       ``amount`` is the number of occurrences of ``"Tracking"``.
    2. ``is_it_blank_page(page)`` is true: the page is an ``"Empty Page"``
       with ``amount`` 0.
    3. Otherwise the page is a ``"Shipping List"`` with ``amount`` 1.

    Args:
        page: Page object exposing ``extractText()``; it is also passed on
            to ``is_it_blank_page``. The annotation is quoted so that
            defining this function does not require the name ``pdf`` to be
            bound at definition time.

    Returns:
        A dict with the keys ``"type"`` (str) and ``"amount"`` (int).
    """
    # A single scan yields both the presence test and the occurrence count.
    tracking_hits = re.findall("Tracking", page.extractText())
    if tracking_hits:
        return {"type": "Label", "amount": len(tracking_hits)}
    if is_it_blank_page(page):
        return {"type": "Empty Page", "amount": 0}
    return {"type": "Shipping List", "amount": 1}
def is_it_blank_page(page: "pdf.PageObject") -> bool:
    """Report whether a page has no extractable text and no XObjects.

    Args:
        page: Page object exposing ``extractText()`` and dict-style access
            to its ``'/Resources'`` entry. The annotation is quoted so that
            defining this function does not require the name ``pdf`` to be
            bound at definition time.

    Returns:
        ``True`` when ``extractText()`` returns an empty string and the
        page's resources contain no ``'/XObject'`` entry (or the page has no
        ``'/Resources'`` entry at all); ``False`` otherwise.
    """
    if page.extractText() != '':
        return False
    try:
        resources = page['/Resources']
    except KeyError:
        # No resource dictionary means there is nothing that could hold an
        # XObject, so a text-less page is blank rather than an error.
        return True
    return '/XObject' not in resources
from PyPDF2.pdf import PageObject
from PyPDF2 import PdfFileReader, PdfFileWriter

# Dump the extractable text of every page of a PDF into a plain-text file.
file_path = "Daycoval.pdf"
pdf = PdfFileReader(file_path)

# 100 dashes, printed to the console and written after each page header.
separator = '-' * 100

# An explicit encoding keeps non-ASCII text from failing on platforms whose
# default encoding cannot represent it. The ``with`` block closes the file,
# so no explicit close() is needed.
with open('Daycoval.txt', 'w', encoding='utf-8') as f:
    for page_num in range(pdf.numPages):
        print('Page: {0}'.format(page_num))
        # Named ``page`` so the imported PageObject class is not shadowed.
        page = pdf.getPage(page_num)
        try:
            txt = page.extractText()
        except Exception as exc:
            # Best effort: a page whose text cannot be extracted is skipped,
            # but the failure is reported instead of silently swallowed.
            print('Page {0}: text extraction failed, skipped ({1})'.format(page_num, exc))
            continue
        print(separator)
        # The console line above uses the 0-based index; the file header is
        # 1-based.
        f.write('Page {0}\n'.format(page_num + 1))
        f.write(separator)
        f.write(txt)