Пример #1
0
def cmap_extraction_example():
	"""Save the ToUnicode CMap of font 'R26' from the tutorial PDF to disk.

	Opens './tutorial-example.pdf', takes its third page, prints a few
	facts about the page fonts and writes the decoded CMap stream to
	'./sample-cmap.txt'.
	"""
	from itertools import islice

	pdf_filepath = './tutorial-example.pdf'

	# A context manager is used instead of try/finally: with the file opened
	# inside the try block, a failed open() left 'fd' unbound and the
	# finally clause raised NameError, hiding the real error.
	with open(pdf_filepath, 'rb') as fd:
		doc = PDFDocument(fd)

		# islice(..., 2, 3) yields exactly one item: the page at index 2.
		page = next(islice(doc.pages(), 2, 3))
		print('page.Resources.Font = {}.'.format(page.Resources.Font))
		print('len(page.Resources.Font) = {}.'.format(len(page.Resources.Font)))

		font = page.Resources.Font['R26']
		print('font.Subtype = {}, bool(font.ToUnicode) = {}.'.format(font.Subtype, bool(font.ToUnicode)))

		# It is PostScript Type1 font, and texts use CMap provided by ToUnicode attribute.
		# Font's ToUnicode attribute contains a reference to the CMap file data stream.
		cmap = font.ToUnicode
		print('type(cmap) = {}.'.format(type(cmap)))
		print('cmap.Filter = {}.'.format(cmap.Filter))

		# 'filtered' is the stream data that gets written out below.
		data = cmap.filtered
		with open('./sample-cmap.txt', 'wb') as fd2:
			fd2.write(data)
Пример #2
0
    def __init__(self, pdf_path):
        """Open *pdf_path* and prepare a document, a viewer and its page list.

        Sets ``self.pdf_path``, ``self.viewer`` (a ``SimplePDFViewer``) and
        ``self.pages`` (every page yielded by ``PDFDocument.pages()``).
        """
        # NOTE(review): the handle is never closed here; presumably the
        # viewer keeps reading from it later -- confirm before changing.
        handle = open(pdf_path, "rb")
        self.pdf_path = pdf_path

        document = PDFDocument(handle)
        self.viewer = SimplePDFViewer(handle)
        self.pages = list(document.pages())
Пример #3
0
def get_text_pypdf(DOI:str) -> str:
    """Download the medRxiv PDF for *DOI* and return its text, lower-cased.

    The PDF is saved under ``pdfs/`` next to this module, with the host
    name prefixed to the file name. Each page is rendered with
    ``SimplePDFViewer``; pages that raise ``OverflowError`` while rendering
    are skipped.

    Returns:
        The concatenated page texts (one line per page) in lower case, or
        an empty string if anything fails. The failure is printed together
        with the DOI; this function never raises.
    """
    try:
        hostname = socket.gethostname()
        path = pathlib.Path(__file__).parent.absolute()
        name = hostname + str(DOI).replace("/", "") + ".pdf"
        fp = Path(path / "pdfs" / name)  # build filepath
        url = "https://www.medrxiv.org/content/" + str(DOI) + "v1.full.pdf"  # build url
        response = requests.get(url)
        # Fail early on HTTP errors instead of saving an error page as a PDF.
        response.raise_for_status()
        fp.write_bytes(response.content)  # save .pdf

        page_texts = []
        # Reopen the file that was just written; 'with' guarantees the handle
        # is released even when parsing or rendering raises.
        with open(fp, "rb") as fd:
            doc = PDFDocument(fd)
            page_count = len(list(doc.pages()))
            viewer = SimplePDFViewer(fd)
            for page_number in range(1, page_count + 1):  # viewer pages are 1-based
                viewer.navigate(page_number)
                try:
                    viewer.render()
                    page_text = u"".join(viewer.canvas.strings)
                    # Characters the console encoding cannot represent are
                    # replaced before the bytes are re-read as windows-1252.
                    page_texts.append(
                        page_text.encode(sys.stdout.encoding, errors='replace').decode("windows-1252") + '\n'
                    )
                except OverflowError:
                    # Best effort: a page that overflows while rendering is
                    # left out rather than aborting the whole document.
                    continue
        return "".join(page_texts).lower()
    except Exception as e:
        # Deliberate catch-all boundary: callers get "" for any failure.
        print(e, DOI)
        return ""
Пример #4
0
def encrypted_and_password_protected_pdf_tutorial():
	"""Walk through reading a password-protected PDF with pdfreader.

	Renders the first page with ``SimplePDFViewer``, reads the first page's
	contents through ``PDFDocument``, and finally shows that a ``ValueError``
	raised while constructing ``PDFDocument`` is caught and reported.
	"""
	pdf_filepath = './encrypted-with-qwerty.pdf'

	# 'with' replaces the former try/finally: if open() failed inside the try
	# block, the finally clause referenced an unbound 'fd' and raised
	# NameError instead of the original error.
	with open(pdf_filepath, 'rb') as fd:
		viewer = SimplePDFViewer(fd, password='******')

		viewer.render()

		text = ''.join(viewer.canvas.strings)
		print('text = {}.'.format(text))

		#--------------------
		doc = PDFDocument(fd, password='******')

		page_one = next(doc.pages())
		print('page_one.Contents = {}.'.format(page_one.Contents))

		#--------------------
		# Any ValueError raised by the constructor is reported, not propagated.
		try:
			doc = PDFDocument(fd, password='******')
			#viewer = SimplePDFViewer(fd, password='******')
		except ValueError as ex:
			print('ValueError raised: {}.'.format(ex))
Пример #5
0
def xobject_image_example():
	"""Extract XObject images, inline images and an image mask from PDFs.

	The first part works on './example-image-xobject.pdf': it converts the
	'img0' XObject of the first page to a Pillow image and then looks the
	same image up through ``SimplePDFViewer``. The second part opens
	'./tutorial-example.pdf', goes to page 5 and converts the first inline
	image that has ``ImageMask`` set.
	"""
	pdf_filepath = './example-image-xobject.pdf'

	# Context managers replace the former try/finally blocks: with open()
	# inside the try, a failed open left 'fd' unbound and the finally clause
	# raised NameError on top of the real error.
	with open(pdf_filepath, 'rb') as fd:
		doc = PDFDocument(fd)

		# Extract XObject image.
		page = next(doc.pages())
		print('page.Resources.XObject = {}.'.format(page.Resources.XObject))

		xobj = page.Resources.XObject['img0']
		print('xobj.Type = {}, xobj.Subtype = {}.'.format(xobj.Type, xobj.Subtype))

		pil_image = xobj.to_Pillow()
		#pil_image.save('./extract-logo.png')

		#--------------------
		# Extract Images: a very simple way.
		viewer = SimplePDFViewer(fd)
		viewer.render()

		all_page_images = viewer.canvas.images
		if 'img0' in all_page_images:
			img = all_page_images['img0']
			print('img.Type = {}, img.Subtype = {}.'.format(img.Type, img.Subtype))

		all_page_inline_images = viewer.canvas.inline_images
		if all_page_inline_images:
			img = all_page_inline_images[0]
			print('img.Type = {}, img.Subtype = {}.'.format(img.Type, img.Subtype))

	#--------------------
	pdf_filepath = './tutorial-example.pdf'

	with open(pdf_filepath, 'rb') as fd:
		viewer = SimplePDFViewer(fd)

		# Extract image masks.
		viewer.navigate(5)
		viewer.render()

		inline_images = viewer.canvas.inline_images
		# next() raises StopIteration when no inline image on the page has
		# ImageMask set.
		image_mask = next(img for img in inline_images if img.ImageMask)

		pil_img = image_mask.to_Pillow()
		#pil_img.save('./mask.png')
Пример #6
0
def document_tutorial():
	"""Tour the PDFDocument API using './tutorial-example.pdf'.

	Shows that a document can be built from an in-memory ``BytesIO`` stream
	as well as from an open file, prints header/metadata/catalog fields, and
	demonstrates several ways of reaching individual pages.
	"""
	pdf_filepath = './tutorial-example.pdf'

	# Variant 1: load the whole file into memory and parse from the stream.
	from io import BytesIO
	with open(pdf_filepath, 'rb') as fd:
		stream = BytesIO(fd.read())
	doc = PDFDocument(stream)

	# Variant 2: parse straight from the open file. 'with' replaces the former
	# try/finally, whose finally clause raised NameError when open() failed.
	with open(pdf_filepath, 'rb') as fd:
		doc = PDFDocument(fd)

		print('doc.header.version = {}.'.format(doc.header.version))
		print('doc.metadata = {}.'.format(doc.metadata))

		print('doc.root.Type = {}.'.format(doc.root.Type))
		print('doc.root.Metadata.Subtype = {}.'.format(doc.root.Metadata.Subtype))
		print('doc.root.Outlines.First["Title"] = {}.'.format(doc.root.Outlines.First['Title']))

		#--------------------
		# Browse document pages.
		page_one = next(doc.pages())

		all_pages = list(doc.pages())
		print('len(all_pages) = {}.'.format(len(all_pages)))

		# islice(pages, k, k + 1) yields only the page at zero-based index k.
		page_six = next(itertools.islice(doc.pages(), 5, 6))
		page_five = next(itertools.islice(doc.pages(), 4, 5))
		page_eight = all_pages[7]

		print('page_six.MediaBox = {}.'.format(page_six.MediaBox))
		print('page_six.Annots[0].Subj = {}.'.format(page_six.Annots[0].Subj))
		print('page_six.Parent.Type = {}.'.format(page_six.Parent.Type))
		print('page_six.Parent.Count = {}.'.format(page_six.Parent.Count))
		print('len(page_six.Parent.Kids) = {}.'.format(len(page_six.Parent.Kids)))
Пример #7
0
def navigate_pages(doc: PDFDocument, viewer: SimplePDFViewer):
    """Render every page of *doc* in *viewer* and process its strings.

    For each page the viewer's canvas strings are copied, line ranges are
    computed with ``get_line_ranges``, the strings are normalised with
    ``establish_uniformity`` and the result is handed to
    ``get_county_election_office_info``. Nothing is returned.
    """
    # Only the page count drives the loop; the page objects are not used.
    for page_number, _ in enumerate(doc.pages(), start=1):
        viewer.navigate(page_number)
        viewer.render()

        # Work on a copy so the viewer's own list is left untouched.
        raw_strings: List[str] = viewer.canvas.strings.copy()

        uniform_strings = establish_uniformity(
            strings_list=raw_strings,
            line_range_list=get_line_ranges(strings_list=raw_strings),
        )

        get_county_election_office_info(strings_list=uniform_strings)
Пример #8
0
def init_cmb_from_pdf(month, year=2020):
    """Parse one monthly statement PDF into a transaction DataFrame.

    Args:
        month: Month number; zero-padded to two digits and substituted into
            ``FILE_PATH`` to locate the PDF.
        year: Year prefixed to the month/day transaction dates found in the
            statement. Defaults to 2020, the value that used to be
            hard-coded.

    Returns:
        A DataFrame with the columns ``transaction_date`` (datetime,
        unparseable values become NaT), ``transction_amount`` (converted
        with ``decimal_from_value``), ``transaction_description`` and
        ``type`` (always ``'cmb'``).

    Raises:
        IndexError: if the start or end marker string is not found.
        ValueError: if the remaining strings do not form complete 6-field
            rows.
    """
    filename = FILE_PATH.format(str(month).zfill(2))

    # Collect strings in a plain list and convert once; np.append inside the
    # loop re-allocated the whole array for every page.
    page_strings = []
    with open(filename, "rb") as fd:
        doc = PDFDocument(fd)
        page_count = len(list(doc.pages()))

        viewer = SimplePDFViewer(fd)
        for page_number in range(1, page_count + 1):  # viewer pages are 1-based
            viewer.navigate(page_number)
            viewer.render()
            # The first four strings of every page are skipped.
            page_strings.extend(viewer.canvas.strings[4:])
    records = np.array(page_strings, dtype=str)

    # Keep only what lies between the first '记账日' marker and the last
    # '本期还款总额' marker.
    head = np.where(records == '记账日')[0][0]
    tail = np.where(records == '本期还款总额')[0][-1]
    records = records[head:tail]

    # Drop the 11 strings that precede the first data row (the commented-out
    # code in the original labelled them as the cn/en title strings).
    records = records[11:]

    # Each row has six fields. The Chinese headers in the statement are:
    # 交易日, 交易摘要, 人民币金额, 卡号末四位, 记账日, 交易地金额.
    # NOTE: 'transction_amount' is misspelled but is part of the returned
    # frame's schema, so it is kept for callers.
    column_en = ['transaction_date', 'transaction_description', 'transction_amount',
                 'card_number', 'bill_date', 'str_rmb']
    # Data: ['' '掌上生活还款' '-3,011.49' '9978' '07/24' '-3,011.49']

    df = pd.DataFrame(records.reshape([len(records) // 6, 6]), columns=column_en)

    df['type'] = 'cmb'

    year_prefix = '{}/'.format(year)
    df['transaction_date'] = df['transaction_date'].apply(
        lambda month_day: year_prefix + month_day)
    df['transaction_date'] = pd.to_datetime(
        df['transaction_date'], format="%Y/%m/%d", errors='coerce')

    df['transction_amount'] = df['transction_amount'].apply(decimal_from_value)

    df = df[['transaction_date', 'transction_amount',
             'transaction_description', 'type']]

    return df
Пример #9
0
def grade_document(document, verbose = False, point_flags = ('%','%') ) :
	"""Return the sum of ``grade_page`` results over all pages of *document*.

	Each page is rendered with ``SimplePDFViewer`` and its canvas is passed
	to ``grade_page`` together with *verbose* and *point_flags*. When
	*verbose* is true, a separator and the zero-based page index are printed
	before each page is graded.
	"""
	pdf = PDFDocument( document )
	pdf_viewer = SimplePDFViewer( document )

	total = 0

	# The page objects themselves are not needed, only their positions.
	for index, _ in enumerate( pdf.pages() ) :

		if verbose :
			print('------------------')
			print('Page:', index)

		# The viewer is navigated with index + 1.
		pdf_viewer.navigate( index + 1 )
		pdf_viewer.render()

		total += grade_page( pdf_viewer.canvas, verbose = verbose, point_flags = point_flags )

	return total
Пример #10
0
def get_text(DOI: str) -> str:
    """Download the medRxiv PDF for *DOI* and return its extracted text.

    The PDF is saved as ``pdfs/curr.pdf`` under the current working
    directory (overwriting any previous download) and every page is
    rendered with ``SimplePDFViewer``.

    Returns:
        The text of all pages, each page followed by a newline.
    """
    fp = Path.cwd() / "pdfs" / "curr.pdf"  # build filepath
    url = "https://www.medrxiv.org/content/" + DOI + "v1.full.pdf"  # build url
    response = requests.get(url)
    fp.write_bytes(response.content)  # save .pdf

    page_texts = []
    # Reopen the path that was just written. The previous hard-coded
    # r"pdfs\curr.pdf" only worked with Windows path separators, and the
    # handle was never closed.
    with fp.open("rb") as fd:
        doc = PDFDocument(fd)
        page_count = len(list(doc.pages()))
        viewer = SimplePDFViewer(fd)

        for page_number in range(1, page_count + 1):  # viewer pages are 1-based
            viewer.navigate(page_number)
            viewer.render()
            # Characters the console encoding cannot represent are replaced
            # before the bytes are re-read as windows-1252.
            page_texts.append(
                u"".join(viewer.canvas.strings).encode(
                    sys.stdout.encoding,
                    errors='replace').decode("windows-1252") + '\n')

    return "".join(page_texts)
Пример #11
0
def font_extraction_example():
	"""Inspect font 'T1_0' on the first page of './example-font.pdf'.

	Prints the page's font names, a few attributes of the font and of its
	embedded font file stream, and reads the stream's filtered data.
	"""
	pdf_filepath = './example-font.pdf'

	# 'with' replaces the former try/finally: with open() inside the try, a
	# failed open left 'fd' unbound and the finally clause raised NameError.
	with open(pdf_filepath, 'rb') as fd:
		doc = PDFDocument(fd)

		page = next(doc.pages())
		print('sorted(page.Resources.Font.keys()) = {}.'.format(sorted(page.Resources.Font.keys())))

		font = page.Resources.Font['T1_0']
		print('font.Subtype = {}, font.BaseFont = {}, font.Encoding = {}.'.format(font.Subtype, font.BaseFont, font.Encoding))

		font_file = font.FontDescriptor.FontFile
		print('type(font_file) = {}.'.format(type(font_file)))
		print('font_file.Filter = {}.'.format(font_file.Filter))

		# The data is read but not saved; the write below is left disabled.
		data = font_file.filtered
		#with open('./sample-font.type1', 'wb') as fd2:
		#	 fd2.write(data)
Пример #12
0
def GradeDocSafe(document, verbose=False, point_flags=defaut_point_flags):
    """Return the sum of ``GradePage`` results over all pages of *document*.

    Each page is rendered with ``SimplePDFViewer`` and its canvas strings
    are passed to ``GradePage`` together with *verbose* and *point_flags*.
    When *verbose* is true, a separator and the one-based page number are
    printed before each page is graded.
    """
    pdf = PDFDocument(document)
    pdf_viewer = SimplePDFViewer(document)

    total = 0

    # Count pages from 1 so the same number serves the log line and the
    # viewer; the page objects themselves are not needed.
    for page_no, _ in enumerate(pdf.pages(), start=1):
        if verbose:
            print('------------------')
            print('Page:', page_no)

        pdf_viewer.navigate(page_no)
        pdf_viewer.render()

        page_strings = pdf_viewer.canvas.strings
        total += GradePage(page_strings,
                           verbose=verbose,
                           point_flags=point_flags)

    return total
Пример #13
0
# # df = read_pdf("Activity_Report.pdf")

# tabula.convert_into("Activity_Report.pdf", "output.csv", output_format="csv", pages='all')

# import pdftables_api

# c = pdftables_api.Client('r0tedshcbejj')
# c.xlsx('Acentria Activity Report.pdf', 'Acentria Activity Report.xlsx')
# c.xlsx('Integration/Tam_Weaver/RWI Policy Types Sample 0.pdf', 'Integration/Tam_Weaver/RWI_PolicyTypes_Sample0.xlsx')
import pdfreader
from pdfreader import PDFDocument, SimplePDFViewer

# Open the report only while it is being read so the file handle is released
# even if parsing fails; the handle was previously never closed.
with open("Acentria Activity Report.pdf", "rb") as fd:
    doc = PDFDocument(fd)
    # First page of the document (fetched but not used further here).
    page = next(doc.pages())
    print(doc.root)
# df = tabula.read_pdf('Acentria Activity Report.pdf', pages = 3, lattice = True)[1]
# import os
# import sys
# import pdftables_api
# from PyPDF2 import PdfFileWriter, PdfFileReader

# if len(sys.argv) < 3:
#     command = os.path.basename(__file__)
#     sys.exit('Usage: {} pdf-file page-number, ...'.format(command))

# pdf_input_file = sys.argv[1];
# pages_args = ",".join(sys.argv[2:]).replace(" ","")
# pages_required = [int(p) for p in filter(None, pages_args.split(","))]
if __name__ == "__main__":
    # Scrape every page of the Bradford County results PDF into a precinct
    # level CSV file. Each page's strings are handed to scraping_one_page
    # together with the CSV writer.
    parties = ["DEM", "REP", "NPA"]
    offices = [
        "PRESIDENT OF THE UNITED STATES", "ATTORNEY GENERAL",
        "AUDITOR GENERAL", "STATE TREASURER"
    ]
    presidential_candidates = [
        "BERNIE SANDERS", "JOSEPH R. BIDEN", "TULSI GABBARD",
        "DONALD TRUMP  (W)", "Total", "Write-in", "DONALD J. TRUMP",
        "ROQUE ROCKY DE LA FUENTE", "BILL WELD", "BERNIE SANDERS (W)"
    ]
    # The PDF handle is now closed when the block ends (it used to leak).
    with open("bradford_results_2020.pdf", "rb") as fd:
        doc = PDFDocument(fd)
        viewer = SimplePDFViewer(fd)
        all_pages = list(doc.pages())
        with open('20200602__pa__primary__bradford__precinct.csv', 'w',
                  newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([
                "county", "district", "office", "vote type", "party", "candidate",
                "election_day", "absentee", "mail-in", "provisional"
            ])
            for i in range(len(all_pages)):
                viewer.navigate(i + 1)  # viewer pages are 1-based
                viewer.render()
                text_on_page = viewer.canvas.strings
                scraping_one_page(text_on_page, writer)
                # Report pages completed so far: the old formula used the
                # zero-based index, so it printed 0% after the first page
                # and never reached 100%.
                print("We are " + str(((i + 1) / len(all_pages)) * 100) + "% done.")
def _build_arg_parser():
    """Create the command-line parser for the PDF image extraction script."""
    parser = argparse.ArgumentParser()
    parser.add_argument("file_name")
    parser.add_argument("-o", "--output", default=None,
                        help="sets the output directory")
    parser.add_argument("-v", "--verbose", default=False, type=str2bool,
                        const=True, nargs='?',
                        help="increase output verbosity")
    # type=int is required on the numeric options: without it a value given
    # on the command line arrives as str and every comparison against an int
    # below raises TypeError.
    parser.add_argument("-fp", "--first_page", default=0, type=int,
                        help="first page to extract from")
    parser.add_argument("-lp", "--last_page", default=1000, type=int,
                        help="last page to extract from")
    parser.add_argument("-mw", "--min_width", default=200, type=int,
                        help="minimum pixel width")
    parser.add_argument("-mh", "--min_height", default=200, type=int,
                        help="minimum pixel height")
    parser.add_argument("-xw", "--max_width", default=1210, type=int,
                        help="maximum pixel width")
    parser.add_argument("-xh", "--max_height", default=1570, type=int,
                        help="maximum pixel height")
    parser.add_argument("-mt", "--make_transparent", default=True,
                        type=str2bool, const=False, nargs='?',
                        help="flag to make the background transparent")
    parser.add_argument("-wt", "--white_to_trans", default=True,
                        type=str2bool, const=False, nargs='?',
                        help="turn white pixels transparent")
    parser.add_argument("-bt", "--black_to_trans", default=True,
                        type=str2bool, const=False, nargs='?',
                        help="turn black pixels transparent")
    # NOTE(review): the fuzz options are only forwarded to _do_transparent;
    # their expected type is not visible here, so they are left untyped.
    parser.add_argument("-wf", "--white_fuzz", default=1,
                        help="fuzz percent (0-100) for white transparency")
    parser.add_argument("-bf", "--black_fuzz", default=1,
                        help="fuzz percent (0-100) for black transparency")
    parser.add_argument("-ims", "--image_string", default="Im",
                        help="string that appears in all image names")
    return parser


def main():
    """Extract XObject images from a PDF and save them as PNG files.

    Pages with zero-based index in ``[first_page, last_page)`` are scanned.
    Every XObject whose key contains ``--image_string`` or ``"im"`` is
    converted with ``to_Pillow()``; images strictly inside the configured
    width/height bounds are saved as ``<output>/page<i>_<key>.png`` and,
    when ``--make_transparent`` is set, passed to ``_do_transparent``.

    Invalid arguments end the program through ``parser.error`` (usage
    message, exit status 2).
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    if args.verbose:
        print(f"Args:\n\t{args}")

    # Validate explicitly: assert statements are stripped under ``python -O``.
    file_name = args.file_name
    if not os.path.exists(file_name):
        parser.error(f"file not found: {file_name}")
    if not file_name.endswith(".pdf"):
        parser.error("must provide '.pdf' file")
    # Strip the extension, then any directory part in either slash style.
    base_file_name = file_name[:-4].split("/")[-1].split("\\")[-1]
    if not base_file_name:
        parser.error("file name has no base name before '.pdf'")

    if args.first_page < 0 or args.last_page < 0:
        parser.error("page numbers must not be negative")
    if args.last_page <= args.first_page:
        parser.error("last page must be greater than first page")

    # Make the output directory
    if args.output is not None:
        output = args.output
    else:
        output = base_file_name + "_images"
        if args.verbose:
            print(f"No output file given; outputing to {output}/")
    os.makedirs(output, exist_ok=True)

    # The handle is closed when the block ends (it used to leak).
    with open(file_name, "rb") as fd:
        doc = PDFDocument(fd)

        for i, page in enumerate(doc.pages()):
            if i < args.first_page:
                continue
            if i >= args.last_page:
                # Stop scanning. The previous exit() call terminated the
                # whole interpreter from inside the loop.
                break

            # NOTE(review): assumes XObject is falsy (e.g. None) when the
            # page has no XObjects -- confirm against pdfreader.
            xobjects = page.Resources.XObject or {}
            if args.verbose:
                nkeys = len(xobjects.keys())
                print(f"On page {i} -- {nkeys} XObjects detected")

            # Loop over possible image objects
            for key in xobjects.keys():
                if not (args.image_string in key or "im" in key):
                    continue
                xobj = xobjects[key]
                try:
                    pil_image = xobj.to_Pillow()
                except IndexError:
                    if args.verbose:
                        print(
                            f"IndexError raised on page {i} {key} - skipping"
                        )
                    continue

                width, height = pil_image.size
                # Bounds are exclusive on both ends.
                if not (args.min_width < width < args.max_width):
                    continue
                if not (args.min_height < height < args.max_height):
                    continue

                if args.verbose:
                    print(
                        f"Saving image {key} on page{i}: "
                        f"(w,h)={pil_image.size}"
                    )
                pil_image.save(f"{output}/page{i}_{key}.png")
                if args.make_transparent:
                    _do_transparent(args, i, key, pil_image, output)
Пример #16
0
def main():
    """Decrypt the invoice report PDF, dump its text and export invoices to Excel.

    Steps:
      1. Copy every page of the hard-coded source PDF into ``temp_pdf.pdf``
         after calling ``decrypt('')`` on the PyPDF2 reader.
      2. Render each page of the copy with ``SimplePDFViewer`` and write its
         strings, one per line, to ``<name>.txt``; non-ASCII characters
         become ``?``.
      3. Scan the text file line by line, tracking the current company,
         vendor and account/department, and collect one row per line that
         ``is_inv`` accepts.
      4. Write the rows to sheet ``DATA`` of ``<name>.xlsx``.

    ``<name>`` is read interactively with ``input``.
    """
    print('Opening pdf and writing to decrypted copy')
    with open('AP12176A_20200701_142127.pdf', 'rb') as p_pdf:  #this will change
        pdfReader = PyPDF2.PdfFileReader(p_pdf)
        pdfWriter = PyPDF2.PdfFileWriter()
        pdfReader.decrypt('')

        #write all the pages in the unlocked file to a new pdf
        print('Writing to decrypted copy.')
        for pageNum in range(pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(pageNum))

        #base name shared by the .txt and .xlsx output files
        u_name = input('File name for decrypted copy? (excluding ".pdf"):  ')

        print('Finishing writing to decrypted copy')
        with open('temp_pdf.pdf', 'wb') as pdfOutputFile:
            pdfWriter.write(pdfOutputFile)  #write to the temporary unlocked pdf

    #open the unlocked pdf.  We'll use PDFDocument and SimplePDFViewer to pull all the text
    with open('temp_pdf.pdf', "rb") as u_pdf:
        doc = PDFDocument(u_pdf)
        reader = SimplePDFViewer(u_pdf)

        print('counting pages in pdf')
        page_ct = len(list(doc.pages()))

        print('cycling through pages')
        with open(u_name + '.txt', 'w') as g:
            for page_number in range(1, page_ct + 1):  #viewer pages are 1-based
                reader.navigate(page_number)
                reader.render()
                if page_number % 10 == 0:
                    print('processing page ' + str(page_number) + ' of ' + str(page_ct))
                for raw in reader.canvas.strings:  #one string per output line
                    #turn unknown chars into ?
                    ln = raw.encode('ascii', 'replace').decode('utf-8')
                    g.write(ln + '\n')

    # Raw strings: the non-raw originals relied on invalid escape sequences
    # such as '\s', which newer Python versions warn about.
    group_inv = re.compile(
        r'^\s(.{8})\s(.{17})\s\s(.{8})\s\s(.{10})\s\s(.{21})\s(.{10})\s(.{18})\s(.{12})(.+)?$'
    )  #fixed-width fields of one invoice line
    find_acct = re.compile(r'ACCOUNT.+(\d{6}).+DEPARTMENT\s+(\w{8})')  #find acct and center
    #10 'word' chars, whitespace, then the vendor name up to the trailing whitespace
    find_vendor = re.compile(r'^([\w]{10})\s+(\S.+\S)\s+$')
    find_corp = re.compile(r'COMPANY\s(\w{4})\s+DATE')  #the 4 characters between COMPANY and DATE

    corp = 'XXXX'
    center = 'XXXXXXXX'
    account = 'XXXXXX'
    v_short = 'XXXX'
    v_long = 'XXXXXXXX'

    data_tmp = []

    ct = 0

    with open(u_name + '.txt', 'r') as h:
        # Lines are deliberately NOT stripped: the vendor pattern requires
        # trailing whitespace, which the newline provides.
        for ln in h:
            corp_match = find_corp.search(ln)
            if corp_match:  #look for a new corp
                corp = corp_match.group(1)
                continue

            vendor_match = find_vendor.search(ln)
            if vendor_match:  #look for a new vendor
                # The long name is only refreshed when the short code changes.
                if v_short != vendor_match.group(1):
                    v_short = vendor_match.group(1)
                    v_long = vendor_match.group(2)
                continue

            acct_match = find_acct.search(ln)
            if acct_match:  #look for a new acct/center
                center = acct_match.group(2)
                account = acct_match.group(1)
                continue

            if not is_inv(ln):  #look for an invoice
                continue

            tmp = group_inv.search(ln)
            gl_eff = tmp.group(1).strip()
            inv_num = tmp.group(2).strip()
            inv_date = tmp.group(3).strip()
            po = tmp.group(4).strip()
            desc = tmp.group(5).strip()

            q = tmp.group(6).strip()
            qty = q.replace(',', '') if q else 0

            prod_id = tmp.group(7).strip()

            exp_text = tmp.group(8).strip()
            exp = float(exp_text.replace(',', '')) if exp_text else 0

            # Group 9 is optional in the pattern and is None when the line
            # ends right after the expense field; calling .strip() on it
            # used to raise AttributeError.
            cred_text = (tmp.group(9) or '').strip()
            cred = float(cred_text.replace(',', '')) if cred_text else 0

            if int(qty) == 0:
                per_unit = 0
            else:
                per_unit = round(float(exp) / float(qty), 2)

            data_tmp.append([
                corp, center, account, v_short, v_long, gl_eff, inv_num,
                inv_date, po, desc, qty, prod_id, exp, cred, per_unit
            ])
            ct += 1
            if ct % 1000 == 0:
                print('Finished adding row ' + str(ct))

    data_cols = [
        'Company', 'Center', 'Account', 'Vendor_Short', 'Vendor_Long',
        'GL_Effective_Date', 'Inv_Number', 'Inv_Date', 'PO', 'Description',
        'Qty', 'ProdID', 'Expense', 'Credit', 'Per_Unit_Cost'
    ]
    col_widths = [14, 12, 12, 18, 35, 22, 20, 13, 12, 31, 10, 20, 15, 15, 18]
    data_inv = pd.DataFrame(data=data_tmp, columns=data_cols)
    data_inv['GL_Effective_Date'] = pd.to_datetime(
        data_inv['GL_Effective_Date'])
    data_inv['Inv_Date'] = pd.to_datetime(data_inv['Inv_Date'])
    data_inv['Account'] = data_inv['Account'].astype('int64')
    data_inv['Qty'] = data_inv['Qty'].astype('int64')
    i_rows = data_inv['Company'].size

    with pd.ExcelWriter(u_name + '.xlsx',
                        engine='xlsxwriter',
                        datetime_format='m/d/yyyy') as writer:
        data_inv.to_excel(writer, sheet_name='DATA', index=False)
        workbook = writer.book
        worksheet = writer.sheets['DATA']
        curr_format = workbook.add_format(
            {'num_format': '$#,##0.00;[Red]($#,##0.00)'})
        # Set width and format in a single call per column. Previously the
        # currency format was set first and then a width-only set_column()
        # call for the same columns replaced it (a later set_column call
        # overrides the earlier column settings).
        currency_columns = (12, 13)  #'Expense' and 'Credit'
        for col, width in enumerate(col_widths):
            cell_format = curr_format if col in currency_columns else None
            worksheet.set_column(col, col, width, cell_format)
        worksheet.autofilter('A1:O' + str(i_rows + 1))
        worksheet.freeze_panes(1, 0)  #freeze 1st row

    print('DONE - pulled ' + str(i_rows) + ' lines into ' + u_name + '.xlsx')
Пример #17
0
import pdfreader
from pdfreader import PDFDocument, SimplePDFViewer
from functions import cleanString
from ItemsClass import Item
from datetime import datetime
import json
import re

# Read every page of the order PDF and gather its strings, skipping the
# first four strings of each page.
# NOTE(review): the handle stays open; it is unclear from this fragment
# whether later code still uses the viewer -- confirm before closing it.
fd = open("order1.pdf", "rb")
doc = PDFDocument(fd)
numpages = sum(1 for _ in doc.pages())
viewer = SimplePDFViewer(fd)

strings = []
for num in range(numpages):
    viewer.navigate(num + 1)  # navigate() is called with num + 1
    viewer.render()
    strings.extend(viewer.canvas.strings[4:])

print(strings)

# Parser state used while walking through the collected strings.
prev = ""
Items = []
item = Item()
# Order record: an item list followed by four initially empty text fields.
Order = {"Items": []}
Order.update(dict.fromkeys(("Request", "Total", "Customer", "Delivery"), ""))
requesting = False
Пример #18
0
def main():
    """Convert a PDF chosen from the current directory into a text file.

    Lists the ``.pdf`` / ``.PDF`` files in the working directory, asks the
    user to pick one by number (``q`` quits), writes a copy to
    ``temp_pdf.pdf`` after calling ``decrypt('')`` on the PyPDF2 reader,
    and dumps the strings of every page, one per line, to ``<name>.txt``
    where ``<name>`` is entered interactively. Characters outside ASCII
    are written as ``?``. The temporary PDF is removed afterwards, even if
    the conversion fails.
    """
    print('*' * 40 + '\nERT PDF TO TXT CONVERTER\n')
    print('Showing PDF files in ' + os.getcwd())

    # Keep the real file names. The old code stripped the extension and
    # re-appended '.pdf', which could not reopen a listed '.PDF' file on a
    # case-sensitive filesystem.
    files = [name for name in os.listdir() if name.endswith((".pdf", ".PDF"))]
    for index, name in enumerate(files):
        print('(' + str(index) + ')  ' + name)

    prompt = 'Enter the number corresponding to the target pdf, or q to quit: '
    while True:
        choice = input(prompt)
        # Handle 'q' before validation so the selection below can never be
        # reached without a file having been chosen.
        if choice == 'q':
            print('Quitting.')
            sys.exit()
        if is_valid(choice, len(files)):
            break
        print('Invalid choice - try again.')

    pdf_to_open = files[int(choice)]
    print(pdf_to_open)

    #open the protected pdf and remove the password
    print('Converting to unlocked PDF')
    with open(pdf_to_open, 'rb') as p_pdf:
        pdfReader = PyPDF2.PdfFileReader(p_pdf)
        pdfReader.decrypt('')
        pdfWriter = PyPDF2.PdfFileWriter()

        #write all the pages in the unlocked file to a new pdf
        for pageNum in range(pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(pageNum))

        #base name of the text file that will be written
        u_name = input('Output file name? (excluding ".txt"):  ')

        with open('temp_pdf.pdf', 'wb') as pdfOutputFile:
            pdfWriter.write(pdfOutputFile)

    #open the unlocked pdf.  We'll use PDFDocument and SimplePDFViewer to pull all the text
    try:
        with open('temp_pdf.pdf', "rb") as u_pdf:
            doc = PDFDocument(u_pdf)
            reader = SimplePDFViewer(u_pdf)
            start_time = time.time()
            page_ct = len(list(doc.pages()))
            print('Writing ' + str(page_ct) + ' pages to ' + u_name + '.txt ...')

            with open(u_name + '.txt', 'w') as g:
                for page_number in range(1, page_ct + 1):  #viewer pages are 1-based
                    reader.navigate(page_number)
                    reader.render()
                    if page_number % 10 == 0:
                        print('Processing page ' + str(page_number) + ' of ' + str(page_ct))
                    for raw in reader.canvas.strings:  #one string per output line
                        #turn unknown chars into ?
                        ln = raw.encode('ascii', 'replace').decode('ascii', 'strict')
                        g.write(ln + '\n')
    finally:
        # Remove the temporary copy whether or not the conversion succeeded.
        os.remove('temp_pdf.pdf')

    print('Saved as ' + u_name + '.txt')
    print("This took %s seconds." % round((time.time() - start_time), 2))
Пример #19
0
def _tag_address(address_text, location_name, mapping):
    """Parse *address_text* with usaddress and attach *location_name*."""
    address = usaddress.tag(address_text.title(), tag_mapping=mapping)[0]
    # title() lower-cases the second letter of the state value; restore upper
    # case. The previous code called .upper() but discarded the result.
    if "state" in address:
        address["state"] = address["state"].upper()
    address["locationName"] = location_name
    return address


async def get_election_offices():
    """Download the election office PDF and extract one record per office.

    The PDF at ``URL`` is fetched with aiohttp and its strings are scanned
    in the order the viewer yields them. County name, addresses, clerk
    name and website are collected as they appear; a "Phone 1:" line
    completes the current record, which is appended to the result and the
    collected fields are reset.

    Returns:
        A list of dicts with the keys ``countyName``, ``physicalAddress``,
        ``mailingAddress``, ``phone``, ``officeSupervisor``,
        ``supervisorTitle`` and ``website``. The same list is also written
        as JSON to ``scrapers/wisconsin/wisconsin.json`` under ``ROOT_DIR``.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(URL) as r:
            text = await r.read()

    # Compile once instead of on every string.
    county_re = re.compile(r"\D+(?=\s-)")
    physical_re = re.compile(r"(?<=MUNICIPAL ADDRESS :).*")
    mailing_re = re.compile(r"(?<=MAILING ADDRESS :).*")
    phone_re = re.compile(r"(?<=Phone 1: ).*")
    supervisor_re = re.compile(r"(?<=COUNTY CLERK: ).*")
    website_re = re.compile(r"http.*")
    mapping = electionsaver.addressSchemaMapping

    # Fields of the office currently being collected.
    phone = office_supervisor = website = location_name = county_name = ""
    physical_address, mailing_address = {}, {}

    doc = PDFDocument(text)
    viewer = SimplePDFViewer(text)
    election_offices = []
    for i, _ in enumerate(doc.pages(), 1):
        viewer.navigate(i)
        viewer.render()
        strings = viewer.canvas.strings
        # This is parsed in the order at which pdf elements are read by the viewer.
        for j, s in enumerate(strings):
            # An address line is combined with the string that follows it;
            # guard against the address being the last string on a page.
            following = strings[j + 1] if j + 1 < len(strings) else ""

            if not county_name:
                m = county_re.search(s)
                if m:
                    county_name = m.group(0).split(maxsplit=1)[0].capitalize()
                    location_name = f"{county_name} Election Office"

            if not physical_address:
                m = physical_re.search(s)
                if m:
                    physical_address = _tag_address(
                        f"{m.group(0)} {following}", location_name, mapping)
            if not mailing_address:
                m = mailing_re.search(s)
                if m:
                    mailing_address = _tag_address(
                        f"{m.group(0)} {following}", location_name, mapping)
            if not phone:
                m = phone_re.search(s)
                if m:
                    phone = m.group(0)
                    election_offices.append({
                        "countyName": county_name,
                        "physicalAddress": physical_address,
                        "mailingAddress": mailing_address,
                        "phone": phone,
                        "officeSupervisor": office_supervisor,
                        "supervisorTitle": "County Clerk",
                        "website": website,
                    })
                    # Reset for the next office. The addresses must be reset
                    # too: previously they kept the first office's values, so
                    # every later record reused the first office's addresses.
                    phone = office_supervisor = website = ""
                    location_name = county_name = ""
                    physical_address, mailing_address = {}, {}
            if not office_supervisor:
                m = supervisor_re.search(s)
                if m:
                    office_supervisor = m.group(0).title()
            if not website:
                m = website_re.search(s)
                if m:
                    website = m.group(0)

    with open(
            os.path.join(ROOT_DIR, "scrapers", "wisconsin", "wisconsin.json"),
            "w") as f:
        json.dump(election_offices, f)
    return election_offices