def cmap_extraction_example():
    """Extract the ToUnicode CMap of font ``R26`` from the tutorial PDF.

    Opens ``./tutorial-example.pdf``, takes its third page, prints a few
    facts about the page fonts and about font ``R26``, and writes the decoded
    CMap stream to ``./sample-cmap.txt``.

    Raises:
        OSError: if the PDF cannot be opened or the output cannot be written.
    """
    from itertools import islice

    pdf_filepath = './tutorial-example.pdf'

    # The file is opened by the ``with`` statement so that a failed open()
    # propagates as-is; with open() inside try/finally the cleanup would
    # reference an unbound ``fd`` and mask the real error.
    with open(pdf_filepath, 'rb') as fd:
        doc = PDFDocument(fd)

        # islice(…, 2, 3) yields exactly the third page (index 2).
        page = next(islice(doc.pages(), 2, 3))
        print('page.Resources.Font = {}.'.format(page.Resources.Font))
        print('len(page.Resources.Font) = {}.'.format(len(page.Resources.Font)))

        font = page.Resources.Font['R26']
        print('font.Subtype = {}, bool(font.ToUnicode) = {}.'.format(font.Subtype, bool(font.ToUnicode)))

        # It is PostScript Type1 font, and texts use CMap provided by ToUnicode attribute.
        # Font's ToUnicode attribute contains a reference to the CMap file data stream.
        cmap = font.ToUnicode
        print('type(cmap) = {}.'.format(type(cmap)))
        print('cmap.Filter = {}.'.format(cmap.Filter))

        data = cmap.filtered
        with open('./sample-cmap.txt', 'wb') as fd2:
            fd2.write(data)
def __init__(self, pdf_path):
    """Open *pdf_path* and prepare a viewer plus the list of its pages.

    Sets ``self.pdf_path``, ``self.viewer`` (a ``SimplePDFViewer``) and
    ``self.pages`` (every page yielded by ``PDFDocument.pages()``).
    """
    self.pdf_path = pdf_path
    # The handle is intentionally left open: the viewer keeps reading from it.
    handle = open(pdf_path, "rb")
    document = PDFDocument(handle)
    self.viewer = SimplePDFViewer(handle)
    self.pages = list(document.pages())
def get_text_pypdf(DOI: str) -> str:
    """Download the medRxiv PDF for *DOI* and return its text, lower-cased.

    The PDF is saved as ``pdfs/<hostname><DOI without slashes>.pdf`` next to
    this module and then rendered page by page.

    Args:
        DOI: value appended to ``https://www.medrxiv.org/content/``.

    Returns:
        One line of text per rendered page, lower-cased, or ``""`` if anything
        fails (the error and the DOI are printed).
    """
    try:
        hostname = socket.gethostname()
        path = pathlib.Path(__file__).parent.absolute()
        name = hostname + str(DOI).replace("/", "") + ".pdf"
        fp = Path(path / "pdfs" / name)  # build filepath
        url = "https://www.medrxiv.org/content/" + str(DOI) + "v1.full.pdf"  # build url

        response = requests.get(url)
        fp.write_bytes(response.content)  # save .pdf

        page_texts = []
        # ``with`` guarantees the handle is closed even when parsing fails.
        with fp.open("rb") as fd:
            doc = PDFDocument(fd)
            page_count = sum(1 for _ in doc.pages())
            viewer = SimplePDFViewer(fd)
            for page_number in range(1, page_count + 1):
                viewer.navigate(page_number)
                try:
                    viewer.render()
                    # NOTE(review): encoding with the stdout encoding and
                    # decoding as windows-1252 only round-trips when stdout is
                    # cp1252 — confirm this is intended.
                    page_texts.append(
                        u"".join(viewer.canvas.strings)
                        .encode(sys.stdout.encoding, errors='replace')
                        .decode("windows-1252") + '\n'
                    )
                except OverflowError:
                    # Best effort: skip a page that cannot be rendered.
                    continue
        return "".join(page_texts).lower()
    except Exception as e:
        # Callers rely on "" for any failure, so the broad catch is kept.
        print(e, DOI)
        return ""
def encrypted_and_password_protected_pdf_tutorial():
    """Demonstrate opening a password-protected PDF with pdfreader.

    Renders the first page of ``./encrypted-with-qwerty.pdf`` with
    ``SimplePDFViewer`` and prints its text, opens the same file with
    ``PDFDocument`` and prints the first page's contents, then opens it once
    more and reports a ``ValueError`` if one is raised.

    Raises:
        OSError: if the PDF cannot be opened.
    """
    pdf_filepath = './encrypted-with-qwerty.pdf'

    # Opening via ``with`` lets a failed open() propagate unchanged instead of
    # being masked by cleanup code touching an unbound ``fd``.
    with open(pdf_filepath, 'rb') as fd:
        viewer = SimplePDFViewer(fd, password='******')
        viewer.render()

        text = ''.join(viewer.canvas.strings)
        print('text = {}.'.format(text))

        #--------------------
        doc = PDFDocument(fd, password='******')

        page_one = next(doc.pages())
        print('page_one.Contents = {}.'.format(page_one.Contents))

        #--------------------
        # NOTE(review): presumably meant to show the error raised for a wrong
        # password — confirm the password used here.
        try:
            doc = PDFDocument(fd, password='******')
            #viewer = SimplePDFViewer(fd, password='******')
        except ValueError as ex:
            print('ValueError raised: {}.'.format(ex))
def xobject_image_example():
    """Demonstrate three ways of pulling images out of a PDF with pdfreader.

    1. Read the XObject ``img0`` from the first page of
       ``./example-image-xobject.pdf`` and convert it to a Pillow image.
    2. Render the same file with ``SimplePDFViewer`` and inspect the images
       and inline images collected on the canvas.
    3. Render page 5 of ``./tutorial-example.pdf`` and convert the first
       inline image flagged ``ImageMask`` to a Pillow image.

    Raises:
        OSError: if either PDF cannot be opened.
        StopIteration: if page 5 has no inline image mask.
    """
    pdf_filepath = './example-image-xobject.pdf'

    # ``with`` closes the handle on every path and lets a failed open()
    # surface directly (the cleanup never touches an unbound ``fd``).
    with open(pdf_filepath, 'rb') as fd:
        doc = PDFDocument(fd)

        # Extract XObject image.
        page = next(doc.pages())
        print('page.Resources.XObject = {}.'.format(page.Resources.XObject))

        xobj = page.Resources.XObject['img0']
        print('xobj.Type = {}, xobj.Subtype = {}.'.format(xobj.Type, xobj.Subtype))

        pil_image = xobj.to_Pillow()
        #pil_image.save('./extract-logo.png')

        #--------------------
        # Extract Images: a very simple way.
        viewer = SimplePDFViewer(fd)
        viewer.render()

        all_page_images = viewer.canvas.images
        if 'img0' in all_page_images:
            img = all_page_images['img0']
            print('img.Type = {}, img.Subtype = {}.'.format(img.Type, img.Subtype))

        all_page_inline_images = viewer.canvas.inline_images
        if all_page_inline_images:
            img = all_page_inline_images[0]
            print('img.Type = {}, img.Subtype = {}.'.format(img.Type, img.Subtype))

    #--------------------
    pdf_filepath = './tutorial-example.pdf'
    with open(pdf_filepath, 'rb') as fd:
        viewer = SimplePDFViewer(fd)

        # Extract image masks.
        viewer.navigate(5)
        viewer.render()

        inline_images = viewer.canvas.inline_images
        image_mask = next(img for img in inline_images if img.ImageMask)

        pil_img = image_mask.to_Pillow()
        #pil_img.save('./mask.png')
def document_tutorial():
    """Walk through basic ``PDFDocument`` usage on ``./tutorial-example.pdf``.

    Builds a document from an in-memory copy of the file, then re-opens the
    file and prints header, metadata and catalog details, followed by
    properties of individual pages reached in several ways.
    """
    from io import BytesIO

    pdf_filepath = './tutorial-example.pdf'

    # A document can be built from any binary stream, e.g. an in-memory copy.
    with open(pdf_filepath, 'rb') as source:
        stream = BytesIO(source.read())
    doc = PDFDocument(stream)

    with open(pdf_filepath, 'rb') as fd:
        doc = PDFDocument(fd)

        print('doc.header.version = {}.'.format(doc.header.version))
        print('doc.metadata = {}.'.format(doc.metadata))

        catalog = doc.root
        print('doc.root.Type = {}.'.format(catalog.Type))
        print('doc.root.Metadata.Subtype = {}.'.format(doc.root.Metadata.Subtype))
        print('doc.root.Outlines.First["Title"] = {}.'.format(doc.root.Outlines.First['Title']))

        #--------------------
        # Browse document pages: first page, all pages, and single pages
        # picked by position from a fresh page generator.
        page_one = next(doc.pages())

        all_pages = list(doc.pages())
        print('len(all_pages) = {}.'.format(len(all_pages)))

        page_six = next(itertools.islice(doc.pages(), 5, 6))
        page_five = next(itertools.islice(doc.pages(), 4, 5))
        page_eight = all_pages[7]

        print('page_six.MediaBox = {}.'.format(page_six.MediaBox))
        print('page_six.Annots[0].Subj = {}.'.format(page_six.Annots[0].Subj))

        parent = page_six.Parent
        print('page_six.Parent.Type = {}.'.format(parent.Type))
        print('page_six.Parent.Count = {}.'.format(page_six.Parent.Count))
        print('len(page_six.Parent.Kids) = {}.'.format(len(page_six.Parent.Kids)))
def navigate_pages(doc: PDFDocument, viewer: SimplePDFViewer):
    """Render every page of *doc* in *viewer* and process its strings.

    For each page the canvas strings are copied, merged according to the
    ranges returned by ``get_line_ranges`` via ``establish_uniformity``, and
    handed to ``get_county_election_office_info``.
    """
    page_number = 0
    for _ in doc.pages():
        # The viewer is 1-based, so advance before navigating.
        page_number += 1
        viewer.navigate(page_number)
        viewer.render()

        # Work on a copy so the canvas's own list is left untouched.
        raw_strings: List[str] = viewer.canvas.strings.copy()
        uniform_strings = establish_uniformity(
            strings_list=raw_strings,
            line_range_list=get_line_ranges(strings_list=raw_strings),
        )
        get_county_election_office_info(strings_list=uniform_strings)
def init_cmb_from_pdf(month, year=2020):
    """Parse one monthly statement PDF into a transactions DataFrame.

    The file name is ``FILE_PATH`` formatted with the zero-padded month. The
    strings of every page (minus the first four of each page) are
    concatenated, cut down to the span between the first '记账日' and the
    last '本期还款总额', stripped of 11 header cells, and reshaped into rows of
    six columns.

    Args:
        month: month number used to build the file name.
        year: year prefixed to the month/day transaction dates
            (default 2020, the previously hard-coded value).

    Returns:
        DataFrame with columns ``transaction_date`` (datetime, NaT where the
        source cell does not parse), ``transction_amount`` (converted with
        ``decimal_from_value``), ``transaction_description`` and ``type``
        (always ``'cmb'``).

    Raises:
        IndexError: if either marker string is missing.
        ValueError: if the record count is not a multiple of six.
    """
    filename = FILE_PATH.format(str(month).zfill(2))
    # logger.info(filename)

    collected = []
    # ``with`` ensures the statement file is closed once all pages are read.
    with open(filename, "rb") as fd:
        doc = PDFDocument(fd)
        page_count = sum(1 for _ in doc.pages())
        # logger.info(page_count)
        viewer = SimplePDFViewer(fd)
        for page_number in range(1, page_count + 1):
            viewer.navigate(page_number)
            viewer.render()
            # Accumulate in a list; np.append in a loop re-copies every time.
            collected.extend(viewer.canvas.strings[4:])
    records = np.array(collected)

    head = np.where(records == '记账日')[0][0]
    tail = np.where(records == '本期还款总额')[0][-1]
    records = records[head:tail]
    # The first 11 cells are the Chinese (5) and English (6) title cells.
    records = records[11:]

    # Source column headers, in order: transaction date, transaction summary,
    # RMB amount, last four card digits, posting date, amount at location.
    column_en = ['transaction_date', 'transaction_description',
                 'transction_amount', 'card_number', 'bill_date', 'str_rmb']
    # Data: ['' '掌上生活还款' '-3,011.49' '9978' '07/24' '-3,011.49']
    df = pd.DataFrame(records.reshape(-1, 6), columns=column_en)
    df['type'] = 'cmb'

    year_prefix = str(year) + '/'
    df['transaction_date'] = df['transaction_date'].apply(
        lambda value: year_prefix + value)
    df['transaction_date'] = pd.to_datetime(
        df['transaction_date'], format="%Y/%m/%d", errors='coerce')
    df['transction_amount'] = df['transction_amount'].apply(decimal_from_value)

    df = df[['transaction_date', 'transction_amount',
             'transaction_description', 'type']]
    return df
def grade_document(document, verbose=False, point_flags=('%', '%')):
    """Return the summed grade of every page in *document*.

    Each page is rendered with ``SimplePDFViewer`` and its canvas is scored by
    ``grade_page``; the per-page results are added up.

    Args:
        document: passed unchanged to ``PDFDocument`` and ``SimplePDFViewer``.
        verbose: when true, print a separator and the zero-based page index,
            and forward the flag to ``grade_page``.
        point_flags: forwarded to ``grade_page``.
    """
    pdf = PDFDocument(document)
    viewer = SimplePDFViewer(document)

    total = 0
    for index, _page in enumerate(pdf.pages()):
        if verbose:
            print('------------------')
            print('Page:', index)

        # The viewer counts pages from 1.
        viewer.navigate(index + 1)
        viewer.render()
        total += grade_page(viewer.canvas,
                            verbose=verbose,
                            point_flags=point_flags)

    return total
def get_text(DOI: str) -> str:
    """Download the medRxiv PDF for *DOI* and return its text.

    The PDF is saved as ``pdfs/curr.pdf`` under the current working directory
    (overwriting any previous download) and rendered page by page.

    Args:
        DOI: value appended to ``https://www.medrxiv.org/content/``.

    Returns:
        The text of every page, each followed by a newline.
    """
    fp = Path.cwd() / "pdfs" / "curr.pdf"  # build filepath
    url = "https://www.medrxiv.org/content/" + DOI + "v1.full.pdf"  # build url

    response = requests.get(url)
    fp.write_bytes(response.content)  # save .pdf

    page_texts = []
    # Re-open the exact path just written; a hard-coded backslash path would
    # only resolve on Windows. ``with`` also closes the handle.
    with fp.open("rb") as fd:
        doc = PDFDocument(fd)
        page_count = sum(1 for _ in doc.pages())
        viewer = SimplePDFViewer(fd)
        for page_number in range(1, page_count + 1):
            viewer.navigate(page_number)
            viewer.render()
            # NOTE(review): encoding with the stdout encoding and decoding as
            # windows-1252 only round-trips when stdout is cp1252 — confirm.
            page_texts.append(
                u"".join(viewer.canvas.strings)
                .encode(sys.stdout.encoding, errors='replace')
                .decode("windows-1252") + '\n'
            )
    return "".join(page_texts)
def font_extraction_example():
    """Read the embedded font file of font ``T1_0`` from ``./example-font.pdf``.

    Prints the font names on the first page, a few attributes of ``T1_0`` and
    of its ``FontDescriptor.FontFile`` stream, and decodes that stream.

    Raises:
        OSError: if the PDF cannot be opened.
    """
    pdf_filepath = './example-font.pdf'

    # ``with`` lets a failed open() propagate unchanged; with open() inside
    # try/finally the cleanup would hit an unbound ``fd`` instead.
    with open(pdf_filepath, 'rb') as fd:
        doc = PDFDocument(fd)

        page = next(doc.pages())
        print('sorted(page.Resources.Font.keys()) = {}.'.format(sorted(page.Resources.Font.keys())))

        font = page.Resources.Font['T1_0']
        print('font.Subtype = {}, font.BaseFont = {}, font.Encoding = {}.'.format(font.Subtype, font.BaseFont, font.Encoding))

        font_file = font.FontDescriptor.FontFile
        print('type(font_file) = {}.'.format(type(font_file)))
        print('font_file.Filter = {}.'.format(font_file.Filter))

        # ``filtered`` is the stream data after its filters were applied.
        data = font_file.filtered
        #with open('./sample-font.type1', 'wb') as fd2:
        #    fd2.write(data)
def GradeDocSafe(document, verbose=False, point_flags=defaut_point_flags):
    '''
    grade = GradeDocSafe(document, verbose = False, point_flags = defaut_point_flags )

    Render every page of *document* and return the sum of ``GradePage``
    applied to each page's canvas strings. With *verbose* set, a separator
    and the one-based page number are printed before each page.
    '''
    pdf = PDFDocument(document)
    viewer = SimplePDFViewer(document)

    total = 0
    for index, _page in enumerate(pdf.pages()):
        human_page_number = index + 1  # viewer pages are 1-based
        if verbose:
            print('------------------')
            print('Page:', human_page_number)

        viewer.navigate(human_page_number)
        viewer.render()
        total += GradePage(viewer.canvas.strings,
                           verbose=verbose,
                           point_flags=point_flags)

    return total
# Scratch script: open the activity report with pdfreader and print the
# document catalog. Earlier tabula / pdftables experiments are kept below as
# commented-out reference.
#
# df = read_pdf("Activity_Report.pdf")
# tabula.convert_into("Activity_Report.pdf", "output.csv", output_format="csv", pages='all')

# NOTE(review): the Client argument below looks like a hard-coded API key —
# confirm, and rotate/remove it if it is real.
# import pdftables_api
# c = pdftables_api.Client('r0tedshcbejj')
# c.xlsx('Acentria Activity Report.pdf', 'Acentria Activity Report.xlsx')
# c.xlsx('Integration/Tam_Weaver/RWI Policy Types Sample 0.pdf', 'Integration/Tam_Weaver/RWI_PolicyTypes_Sample0.xlsx')

import pdfreader
from pdfreader import PDFDocument, SimplePDFViewer

# ``with`` closes the report once the catalog has been printed.
with open("Acentria Activity Report.pdf", "rb") as fd:
    doc = PDFDocument(fd)
    page = next(doc.pages())
    print(doc.root)

# df = tabula.read_pdf('Acentria Activity Report.pdf', pages = 3, lattice = True)[1]

# import os
# import sys
# import pdftables_api
# from PyPDF2 import PdfFileWriter, PdfFileReader

# if len(sys.argv) < 3:
#     command = os.path.basename(__file__)
#     sys.exit('Usage: {} pdf-file page-number, ...'.format(command))

# pdf_input_file = sys.argv[1];
# pages_args = ",".join(sys.argv[2:]).replace(" ","")
# pages_required = [int(p) for p in filter(None, pages_args.split(","))]
if __name__ == "__main__":
    # Render every page of the results PDF and let scraping_one_page() write
    # the rows it finds into the precinct CSV.
    parties = ["DEM", "REP", "NPA"]
    offices = [
        "PRESIDENT OF THE UNITED STATES",
        "ATTORNEY GENERAL",
        "AUDITOR GENERAL",
        "STATE TREASURER",
    ]
    presidential_candidates = [
        "BERNIE SANDERS",
        "JOSEPH R. BIDEN",
        "TULSI GABBARD",
        "DONALD TRUMP (W)",
        "Total",
        "Write-in",
        "DONALD J. TRUMP",
        "ROQUE ROCKY DE LA FUENTE",
        "BILL WELD",
        "BERNIE SANDERS (W)",
    ]

    # ``with`` keeps the PDF open for the whole scrape and closes it afterwards.
    with open("bradford_results_2020.pdf", "rb") as fd:
        doc = PDFDocument(fd)
        viewer = SimplePDFViewer(fd)
        all_pages = list(doc.pages())

        with open('20200602__pa__primary__bradford__precinct.csv', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([
                "county", "district", "office", "vote type", "party",
                "candidate", "election_day", "absentee", "mail-in",
                "provisional"
            ])

            for i in range(len(all_pages)):
                # The viewer counts pages from 1.
                viewer.navigate(i + 1)
                viewer.render()
                text_on_page = viewer.canvas.strings
                #print(text_on_page)
                scraping_one_page(text_on_page, writer)
                print("We are " + str((i / len(all_pages)) * 100) + "% done.")
def main():
    """Extract image XObjects from a PDF into PNG files.

    Command-line driven: takes a ``.pdf`` path plus optional page range, size
    limits and transparency flags. Every XObject whose key contains the
    ``--image_string`` value or ``"im"`` and whose size lies strictly between
    the min and max limits is saved as ``<output>/page<i>_<key>.png``; when
    ``--make_transparent`` is set, ``_do_transparent`` is called for it too.

    Raises:
        AssertionError: if the file is missing, is not ``.pdf``, has an empty
            base name, or the page range is invalid.
    """
    # Parse the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("file_name")
    parser.add_argument("-o", "--output", default=None,
                        help="sets the output directory")
    parser.add_argument("-v", "--verbose", default=False, type=str2bool,
                        const=True, nargs='?',
                        help="increase output verbosity")
    # Page and pixel limits are compared against ints below, so values given
    # on the command line must be converted (argparse yields str otherwise).
    parser.add_argument("-fp", "--first_page", default=0, type=int,
                        help="first page to extract from")
    parser.add_argument("-lp", "--last_page", default=1000, type=int,
                        help="last page to extract from")
    parser.add_argument("-mw", "--min_width", default=200, type=int,
                        help="minimum pixel width")
    parser.add_argument("-mh", "--min_height", default=200, type=int,
                        help="minimum pixel height")
    parser.add_argument("-xw", "--max_width", default=1210, type=int,
                        help="maximum pixel width")
    parser.add_argument("-xh", "--max_height", default=1570, type=int,
                        help="maximum pixel height")
    parser.add_argument("-mt", "--make_transparent", default=True,
                        type=str2bool, const=False, nargs='?',
                        help="flag to make the background transparent")
    parser.add_argument("-wt", "--white_to_trans", default=True,
                        type=str2bool, const=False, nargs='?',
                        help="turn white pixels transparent")
    parser.add_argument("-bt", "--black_to_trans", default=True,
                        type=str2bool, const=False, nargs='?',
                        help="turn black pixels transparent")
    # NOTE(review): the fuzz options are only forwarded to _do_transparent;
    # their type is left untouched because that helper is not visible here.
    parser.add_argument("-wf", "--white_fuzz", default=1,
                        help="fuzz percent (0-100) for white transparency")
    parser.add_argument("-bf", "--black_fuzz", default=1,
                        help="fuzz percent (0-100) for black transparency")
    parser.add_argument("-ims", "--image_string", default="Im",
                        help="string that appears in all image names")
    args = parser.parse_args()

    if args.verbose:
        print(f"Args:\n\t{args}")

    # Obtain the base filename
    file_name = args.file_name
    assert os.path.exists(file_name)
    assert file_name[-4:] == ".pdf", "must provide '.pdf' file"
    base_file_name = file_name[:-4]

    # Split on slashes (both separators, so Windows-style paths work too)
    base_file_name = base_file_name.split("/")[-1]
    base_file_name = base_file_name.split("\\")[-1]
    assert len(base_file_name) > 0

    # Make the output directory
    if args.output is not None:
        output = args.output
    else:
        output = base_file_name + "_images"
        if args.verbose:
            print(f"No output file given; outputing to {output}/")
    os.makedirs(output, exist_ok=True)

    # Check pages
    assert args.first_page > -1
    assert args.last_page > -1
    assert args.last_page > args.first_page

    # ``with`` closes the PDF on every exit path, including the early stop.
    with open(file_name, "rb") as fd:
        doc = PDFDocument(fd)

        # Loop over pages
        for i, page in enumerate(doc.pages()):
            if i < args.first_page:
                continue
            if i >= args.last_page:
                # Past the requested range: stop without killing the process.
                break

            xobjects = page.Resources.XObject
            if args.verbose:
                nkeys = len(xobjects.keys())
                print(f"On page {i} -- {nkeys} XObjects detected")

            # Loop over possible image objects
            for key in xobjects.keys():
                if not (args.image_string in key or "im" in key):
                    continue

                xobj = xobjects[key]
                try:
                    pil_image = xobj.to_Pillow()
                except IndexError:
                    if args.verbose:
                        print(
                            f"IndexError raised on page {i} {key} - skipping"
                        )
                    continue

                width, height = pil_image.size
                too_big = width >= args.max_width or height >= args.max_height
                too_small = width <= args.min_width or height <= args.min_height
                if too_big or too_small:
                    continue

                if args.verbose:
                    print(
                        f"Saving image {key} on page{i}: "
                        f"(w,h)={pil_image.size}"
                    )
                pil_image.save(f"{output}/page{i}_{key}.png")

                if args.make_transparent:
                    _do_transparent(args, i, key, pil_image, output)
    return
def main():
    """Turn the invoice report PDF into a text dump and an Excel workbook.

    Steps:
      1. Decrypt ``AP12176A_20200701_142127.pdf`` (empty password) into
         ``temp_pdf.pdf``.
      2. Ask for an output base name and write every rendered string of the
         decrypted PDF, one per line, to ``<name>.txt``.
      3. Scan that text for company, vendor, account/department and invoice
         lines and collect one row per invoice.
      4. Write the rows to sheet ``DATA`` of ``<name>.xlsx``.
    """
    print('Opening pdf and writing to decrypted copy')
    pdfWriter = PyPDF2.PdfFileWriter()
    # The source must stay open until the writer has written its pages.
    with open('AP12176A_20200701_142127.pdf', 'rb') as p_pdf:  #this will change
        pdfReader = PyPDF2.PdfFileReader(p_pdf)
        pdfReader.decrypt('')

        #write all the pages in the unlocked file to a new pdf
        print('Writing to decrypted copy.')
        for pageNum in range(pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(pageNum))

        #prompt for the base name of the .txt / .xlsx output files
        u_name = input('File name for decrypted copy? (excluding ".pdf"): ')
        print('Finishing writing to decrypted copy')
        with open('temp_pdf.pdf', 'wb') as pdfOutputFile:
            pdfWriter.write(pdfOutputFile)  #write to the temporary unlocked pdf

    #open the unlocked pdf. We'll use PDFDocument and SimplePDFViewer to pull all the text
    with open('temp_pdf.pdf', "rb") as u_pdf:
        doc = PDFDocument(u_pdf)
        reader = SimplePDFViewer(u_pdf)
        print('counting pages in pdf')
        page_ct = sum(1 for _ in doc.pages())  #count number of pages
        print('cycling through pages')
        with open(u_name + '.txt', 'w') as g:
            for pg in range(page_ct):  #cycle through pages
                reader.navigate(pg + 1)
                reader.render()
                if (pg + 1) % 10 == 0:
                    print('processing page ' + str(pg + 1) + ' of ' +
                          str(page_ct))
                #canvas.strings is a list with 1 line per element
                for text in reader.canvas.strings:
                    #turn unknown chars into ?
                    ln = text.encode('ascii', 'replace').decode('utf-8')
                    g.write(ln + '\n')

    # Raw strings: the patterns contain \s, \d, \w, which are invalid escape
    # sequences in ordinary string literals.
    #regex for grouping an invoice
    group_inv = re.compile(
        r'^\s(.{8})\s(.{17})\s\s(.{8})\s\s(.{10})\s\s(.{21})\s(.{10})\s(.{18})\s(.{12})(.+)?$'
    )
    #find acct and center
    find_acct = re.compile(r'ACCOUNT.+(\d{6}).+DEPARTMENT\s+(\w{8})')
    #find 10 'word' chars, then a space, then everything up to the line break
    find_vendor = re.compile(r'^([\w]{10})\s+(\S.+\S)\s+$')
    #find the 4 characters between COMPANY and DATE
    find_corp = re.compile(r'COMPANY\s(\w{4})\s+DATE')

    # Placeholders used until the first matching header line is seen.
    corp = 'XXXX'
    center = 'XXXXXXXX'
    account = 'XXXXXX'
    v_short = 'XXXX'
    v_long = 'XXXXXXXX'
    data_tmp = []
    ct = 0

    def to_amount(text):
        """Return *text* as a float (thousands commas removed), or 0 if empty."""
        return float(text.replace(',', '')) if text else 0

    with open(u_name + '.txt', 'r') as h:
        for ln in h:
            # Each line is exactly one of: company header, vendor header,
            # account/department header, invoice — tested in that order.
            corp_match = find_corp.search(ln)
            if corp_match:  #look for a new corp
                corp = corp_match.group(1)
                continue

            vendor_match = find_vendor.search(ln)
            if vendor_match:  #look for a new vendor
                # The long name is only refreshed when the short code changes.
                if v_short != str(vendor_match.group(1)):
                    v_short = str(vendor_match.group(1))
                    v_long = str(vendor_match.group(2))
                continue

            acct_match = find_acct.search(ln)
            if acct_match:  #look for a new acct/center
                center = acct_match.group(2)
                account = acct_match.group(1)
                continue

            if not is_inv(ln):  #look for an invoice
                continue

            tmp = group_inv.search(ln)
            # Group 9 is optional and is None when nothing follows the
            # expense column, so every group is normalised to a string first.
            (gl_eff, inv_num, inv_date, po, desc, q, prod_id, exp_text,
             cred_text) = [(tmp.group(n) or '').strip() for n in range(1, 10)]

            qty = q.replace(',', '') if q != '' else 0
            exp = to_amount(exp_text)
            cred = to_amount(cred_text)
            if int(qty) == 0:
                per_unit = 0
            else:
                per_unit = round(float(exp) / float(qty), 2)

            data_tmp.append([
                corp, center, account, v_short, v_long, gl_eff, inv_num,
                inv_date, po, desc, qty, prod_id, exp, cred, per_unit
            ])
            ct += 1
            if ct % 1000 == 0:
                print('Finished adding row ' + str(ct))

    data_cols = [
        'Company', 'Center', 'Account', 'Vendor_Short', 'Vendor_Long',
        'GL_Effective_Date', 'Inv_Number', 'Inv_Date', 'PO', 'Description',
        'Qty', 'ProdID', 'Expense', 'Credit', 'Per_Unit_Cost'
    ]
    col_widths = [14, 12, 12, 18, 35, 22, 20, 13, 12, 31, 10, 20, 15, 15, 18]

    data_inv = pd.DataFrame(data=data_tmp, columns=data_cols)
    data_inv['GL_Effective_Date'] = pd.to_datetime(
        data_inv['GL_Effective_Date'])
    data_inv['Inv_Date'] = pd.to_datetime(data_inv['Inv_Date'])
    data_inv['Account'] = data_inv['Account'].astype('int64')
    data_inv['Qty'] = data_inv['Qty'].astype('int64')
    i_rows = data_inv['Company'].size

    with pd.ExcelWriter(u_name + '.xlsx',
                        engine='xlsxwriter',
                        datetime_format='m/d/yyyy') as writer:
        data_inv.to_excel(writer, sheet_name='DATA', index=False)
        workbook = writer.book
        worksheet = writer.sheets['DATA']
        curr_format = workbook.add_format(
            {'num_format': '$#,##0.00;[Red]($#,##0.00)'})
        worksheet.autofilter('A1:O' + str(i_rows + 1))
        worksheet.freeze_panes(1, 0)  #freeze 1st row

        # Width and format are set in a single call per column: a later
        # set_column() on the same column replaces the earlier one, which
        # would drop the currency format from Expense and Credit.
        currency_cols = (12, 13)
        for col_idx, width in enumerate(col_widths):
            cell_format = curr_format if col_idx in currency_cols else None
            worksheet.set_column(col_idx, col_idx, width, cell_format)

    print('DONE - pulled ' + str(i_rows) + ' lines into ' + u_name + '.xlsx')
import pdfreader
from pdfreader import PDFDocument, SimplePDFViewer
from functions import cleanString
from ItemsClass import Item
from datetime import datetime
import json
import re

# Collect the rendered strings of every page of the order PDF, dropping the
# first four strings of each page.
strings = []
# ``with`` closes the PDF once all pages have been rendered.
with open("order1.pdf", "rb") as fd:
    doc = PDFDocument(fd)
    numpages = sum(1 for _ in doc.pages())
    viewer = SimplePDFViewer(fd)
    for num in range(numpages):
        # The viewer counts pages from 1.
        viewer.navigate(num + 1)
        viewer.render()
        strings += viewer.canvas.strings[4:]

print(strings)

# Parser state, initialised empty.
prev = ""
Items = []
item = Item()
Order = {
    "Items": [],
    "Request": "",
    "Total": "",
    "Customer": "",
    "Delivery": ""
}
requesting = False
def main():
    """Interactively convert a PDF from the working directory to a text file.

    Lists the ``.pdf``/``.PDF`` files in the current directory, asks which one
    to convert (``q`` quits), decrypts it with an empty password into
    ``temp_pdf.pdf``, asks for an output base name, and writes every rendered
    string of the decrypted copy, one per line, to ``<name>.txt``. The
    temporary PDF is removed afterwards, also when the extraction fails.
    """
    print('*' * 40 + '\nERT PDF TO TXT CONVERTER\n')
    print('Showing PDF files in ' + os.getcwd())

    # Full file names are kept so the original extension (.pdf or .PDF) is
    # used when opening; re-appending '.pdf' breaks on case-sensitive
    # filesystems.
    files = []
    for file in os.listdir():
        if file.endswith((".pdf", ".PDF")):
            print('(' + str(len(files)) + ') ' + file)
            files.append(file)
    f_count = len(files)

    prompt = 'Enter the number corresponding to the target pdf, or q to quit: '
    choice = input(prompt)
    # ``== False`` is kept on purpose: is_valid() is defined elsewhere and a
    # non-bool "valid" result (e.g. None) must still end the loop.
    while is_valid(choice, f_count) == False:
        if choice == 'q':
            print('Quitting.')
            sys.exit()
        print('Invalid choice - try again.')
        choice = input(prompt)

    if choice == 'q':
        return

    pdf_to_open = files[int(choice)]
    print(pdf_to_open)

    #open the protected pdf and remove the password
    print('Converting to unlocked PDF')
    pdfWriter = PyPDF2.PdfFileWriter()
    # The source must stay open until the writer has written its pages.
    with open(pdf_to_open, 'rb') as p_pdf:
        pdfReader = PyPDF2.PdfFileReader(p_pdf)
        pdfReader.decrypt('')

        #write all the pages in the unlocked file to a new pdf
        for pageNum in range(pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(pageNum))

        #prompt for the base name of the .txt output file
        u_name = input('Output file name? (excluding ".txt"): ')
        with open('temp_pdf.pdf', 'wb') as pdfOutputFile:
            pdfWriter.write(pdfOutputFile)

    #open the unlocked pdf. We'll use PDFDocument and SimplePDFViewer to pull all the text
    try:
        with open('temp_pdf.pdf', "rb") as u_pdf:
            doc = PDFDocument(u_pdf)
            reader = SimplePDFViewer(u_pdf)
            start_time = time.time()
            page_ct = sum(1 for _ in doc.pages())  #count number of pages
            print('Writing ' + str(page_ct) + ' pages to ' + u_name +
                  '.txt ...')
            with open(u_name + '.txt', 'w') as g:
                for pg in range(page_ct):  #cycle through pages
                    reader.navigate(pg + 1)
                    reader.render()
                    if (pg + 1) % 10 == 0:
                        print('Processing page ' + str(pg + 1) + ' of ' +
                              str(page_ct))
                    #canvas.strings is a list with 1 line per element
                    for text in reader.canvas.strings:
                        #turn unknown chars into ?
                        ln = text.encode('ascii', 'replace')
                        ln = ln.decode('ascii', 'strict')
                        g.write(ln + '\n')
    finally:
        # Runs after the handle is closed, so the decrypted copy never
        # lingers on disk.
        os.remove('temp_pdf.pdf')

    print('Saved as ' + u_name + '.txt')
    print("This took %s seconds." % round((time.time() - start_time), 2))
async def get_election_offices():
    """Download the election-office PDF from ``URL`` and parse it.

    The rendered strings of every page are scanned in viewer order. County
    name, physical and mailing address, supervisor and website are picked up
    as they appear; a ``Phone 1:`` line completes an office, which is then
    appended and the per-office state is cleared. The result is also written
    to ``scrapers/wisconsin/wisconsin.json`` under ``ROOT_DIR``.

    Returns:
        List of dicts with keys ``countyName``, ``physicalAddress``,
        ``mailingAddress``, ``phone``, ``officeSupervisor``,
        ``supervisorTitle`` and ``website``.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(URL) as r:
            text = await r.read()

    # Prep helper vars
    phone = office_supervisor = website = location_name = county_name = ""
    # Two distinct dicts; ``({}, ) * 2`` would bind both names to one object.
    physical_address, mailing_address = {}, {}
    election_offices = []
    mapping = electionsaver.addressSchemaMapping

    doc = PDFDocument(text)
    viewer = SimplePDFViewer(text)

    for i, _page in enumerate(doc.pages(), 1):
        viewer.navigate(i)
        viewer.render()

        # This is parsed in the order at which pdf elements are read by the viewer.
        for j, s in enumerate(viewer.canvas.strings):
            if not county_name:
                m = re.search(r"\D+(?=\s-)", s)
                if m:
                    county_name = m.group(0).split(maxsplit=1)[0].capitalize()
                    location_name = f"{county_name} Election Office"

            if not physical_address:
                m = re.search(r"(?<=MUNICIPAL ADDRESS :).*", s)
                if m:
                    # The address continues on the following string.
                    physical_address = usaddress.tag(
                        f"{m.group(0)} {viewer.canvas.strings[j + 1]}".title(),
                        tag_mapping=mapping,
                    )[0]
                    # .title() lower-cased the state's second letter; store
                    # the upper-cased value (a bare .upper() is discarded).
                    physical_address["state"] = physical_address["state"].upper()
                    physical_address["locationName"] = location_name

            if not mailing_address:
                m = re.search(r"(?<=MAILING ADDRESS :).*", s)
                if m:
                    mailing_address = usaddress.tag(
                        f"{m.group(0)} {viewer.canvas.strings[j + 1]}".title(),
                        tag_mapping=mapping,
                    )[0]
                    mailing_address["state"] = mailing_address["state"].upper()
                    mailing_address["locationName"] = location_name

            if not phone:
                m = re.search(r"(?<=Phone 1: ).*", s)
                if m:
                    phone = m.group(0)
                    election_offices.append({
                        "countyName": county_name,
                        "physicalAddress": physical_address,
                        "mailingAddress": mailing_address,
                        "phone": phone,
                        "officeSupervisor": office_supervisor,
                        "supervisorTitle": "County Clerk",
                        "website": website,
                    })
                    # reset for next round — addresses included, otherwise
                    # every later office would reuse the first one's.
                    phone = office_supervisor = website = ""
                    location_name = county_name = ""
                    physical_address, mailing_address = {}, {}

            if not office_supervisor:
                m = re.search(r"(?<=COUNTY CLERK: ).*", s)
                if m:
                    office_supervisor = m.group(0).title()

            if not website:
                m = re.search(r"http.*", s)
                if m:
                    website = m.group(0)

    with open(
            os.path.join(ROOT_DIR, "scrapers", "wisconsin", "wisconsin.json"),
            "w") as f:
        json.dump(election_offices, f)
    return election_offices