def generateCSV(filename):
    """
    Read a PDF file and create a single CSV equivalent.

    The tables found by camelot are exported as intermediate CSV files,
    merged into ``api_covid19/files/{filename}.csv`` and the intermediate
    files are deleted afterwards. The first intermediate file supplies the
    header row; every other file is read without a header and is given the
    same column names. A dot is printed after each step as a progress mark.

    Arguments:
    filename -- PDF to read without .pdf extension

    Raises:
    IndexError -- if the export produced no intermediate CSV file
    """
    print(".", end='')

    # Get path and number of pages
    n_pages = getPagesNumber(filename)
    file_path = f'api_covid19/files/{filename}.pdf'

    # Convert PDF to CSV
    print(".", end='')
    tables = camelot.read_pdf(file_path, pages=f'1-{n_pages}', split_text=True)
    print(".", end='')
    tables.export(f'api_covid19/files/intermediate_{filename}.csv', f='csv', compress=False)
    print(".", end='')

    # Merge generated CSV files into just one
    all_filenames = sorted(glob.glob(f'api_covid19/files/intermediate_{filename}*.csv'))
    first_frame = pd.read_csv(all_filenames[0])
    print(".", end='')

    frames = [first_frame]
    for path in all_filenames[1:]:
        frame = pd.read_csv(path, header=None)
        frame.columns = first_frame.columns
        frames.append(frame)
        print(".", end='')

    # DataFrame.append was removed in pandas 2.0; one concat yields the same
    # rows and avoids re-copying the accumulated frame on every iteration.
    combined_csv = pd.concat(frames)
    combined_csv.to_csv(f'api_covid19/files/{filename}.csv', index=False, encoding='utf-8-sig')

    # Finally remove intermediate CSV files
    for path in all_filenames:
        os.remove(path)
def bankB(self):
    """
    Convert the bankB statement at ``self.path`` into ``./bankBSolution.csv``.

    The PDF is parsed with camelot (stream flavor, fixed column separators
    and table area), the first exported table is loaded, empty cells of the
    ``Information`` column are filled from ``Unnamed: 2``, and every
    ``Unnamed`` helper column is dropped before the result is written.
    The intermediate CSV files used along the way are removed at the end.
    """
    page_csv = './bankB-page-1-table-1.csv'
    temp_csv = './tempB1.csv'

    # Read bankB statement
    tables = camelot.read_pdf(self.path, flavor='stream',
                              columns=['72,95,209,327,442,529'],
                              table_areas=['0,792,800,100'])
    # Export pages of bankB pdf
    tables.export('./bankB.csv', f='csv')

    # Read data from the csv file
    df = pandas.read_csv(page_csv, skiprows=4)

    # Merge the Information column with the values that landed in 'Unnamed: 2'
    df['InformationReplacing'] = df['Information'].fillna(df['Unnamed: 2'])
    df['Information'] = df['InformationReplacing']
    df.to_csv(temp_csv)

    # Read the temporary file back; the index written above comes back as an
    # 'Unnamed' column and is dropped together with the other helper columns.
    df1 = pandas.read_csv(temp_csv)
    helper_columns = [name for name in df1.columns if "Unnamed" in name]
    helper_columns.append('InformationReplacing')
    df1.drop(helper_columns, axis=1, inplace=True)

    # Output bankB Solution
    df1.to_csv('./bankBSolution.csv')

    os.remove(page_csv)
    # Remove the file under the exact name it was written with; the lower-case
    # './tempb1.csv' used before does not exist on case-sensitive file systems.
    os.remove(temp_csv)
def bankA(self):
    """
    Export the tables of the bankA statement at ``self.path`` to CSV.

    All pages are parsed with camelot's stream flavor and exported under the
    ``./bank.csv`` base name. Post-processing is then delegated to
    ``removeExtraColumnsBankA`` and ``concatenateBankA``, which are defined
    elsewhere in the class.
    """
    # Parse every page of the statement and write the tables out as CSV
    camelot.read_pdf(self.path, pages='all', flavor='stream').export('./bank.csv', f='csv')

    # Hand over to the sibling methods, in this order
    self.removeExtraColumnsBankA()
    self.concatenateBankA()
def bankC(self):
    """
    Convert the bankC statement at ``self.path`` into ``./bankCSolution.csv``.

    Parsing is limited to a fixed table area so that only the top table is
    kept. The first exported table is reloaded with its first row skipped,
    written out as the solution file, and the exported CSV is removed.
    """
    exported_csv = './bankC-page-1-table-1.csv'

    # Table area restricts the parse to the top table of the page
    extracted = camelot.read_pdf(self.path, flavor='stream', table_areas=['0,792,800,400'])
    extracted.export('./bankC.csv', f='csv')

    # Reload the exported table without its first row and save the result
    statement = pandas.read_csv(exported_csv, skiprows=1)
    statement.to_csv('./bankCSolution.csv')

    os.remove(exported_csv)
def detect_tables(opt):
    """
    Detect table regions on a PDF page and export each table to Excel.

    The page image is run through ``detectTable``; every detected box is
    converted to PDF coordinates with ``bboxes_pdf`` and passed to camelot
    (stream flavor) as a ``table_areas`` entry. Each extracted table is
    written to ``<pdf path minus last 4 chars>-<page>-table-<i>.xlsx``.
    The image at ``img_path`` and the ``outputs`` directory are removed once
    detection has run.

    Arguments:
    opt -- object whose ``pdf_path`` attribute names the PDF to process
    """
    pdf_file = opt.pdf_path
    # NOTE(review): the page number is set to False here, so the file names
    # contain "False" and camelot receives pages="False" -- confirm the
    # intended page value.
    pg = False
    see_example = False

    stem = pdf_file[:-4]
    page_tag = str(pg)
    img_path = stem + "-" + page_tag + ".jpg"

    pdf_page = norm_pdf_page(pdf_file, pg)
    img = pdf_page2img(pdf_file, pg, save_image=True)

    # Run the detector on the page image, then clean up its artefacts
    detector_opt = parameters(img_path)
    output = outpout_yolo(detectTable(detector_opt))
    os.remove(img_path)
    os.rmdir("outputs")

    if see_example:
        # Debug view: outline every detected box on top of the page image
        for box in output:
            (x1_img, y1_img, x2_img, y2_img), (_w_table, _h_table), (_H_img, _W_img) = img_dim(img, box)
            xs = [x1_img, x2_img, x2_img, x1_img, x1_img]
            ys = [y1_img, y1_img, y2_img, y2_img, y1_img]
            plt.plot(xs, ys, linestyle='-.', alpha=0.7)
        plt.imshow(img)
        plt.savefig(stem + "-" + page_tag + ".png")

    # camelot expects "x1,y1,x2,y2" strings where (x1, y1) is the left-top and
    # (x2, y2) the right-bottom corner in PDF coordinate space
    interesting_areas = []
    for box in output:
        x1, y1, x2, y2 = bboxes_pdf(img, pdf_page, box)
        interesting_areas.append(",".join(map(str, (x1, y1, x2, y2))))

    found = camelot.read_pdf(filepath=pdf_file, pages=page_tag, flavor="stream",
                             table_areas=interesting_areas)
    frames = [table.df for table in found]
    for i, frame in enumerate(frames):
        frame.to_excel(stem + "-" + page_tag + "-table-" + str(i) + ".xlsx")
def main():
    """
    Parse one table area of a sample statement and pretty-print the result.

    The PDF layout is loaded once through ``faster_load`` and handed to
    ``read_pdf`` via ``preprocess_kwargs``; every table found is printed as
    a DataFrame.
    """
    # The module name was missing from this import, which is a syntax error.
    # NOTE(review): read_pdf lives in camelot.io upstream -- confirm that the
    # build used here (it accepts num_columns/preprocess_kwargs) does the same.
    from camelot.io import read_pdf

    filename = "/Users/vijender/indvision-data/acctstmt_d_xxxxxx591m_emailacctstmt_unlckd.pdf"
    layouts, _pq_obj, dimensions = faster_load.load_pdf_and_layout(filename, {})
    preprocess_kwargs = {'layouts': layouts, 'dimensions': dimensions}

    tables = read_pdf(filepath=filename, flavor='stream',
                      table_areas=["0,448,605,390"], pages="1",
                      row_tol=5, column_tol=0, edge_tol=100, num_columns=7,
                      preprocess_kwargs=preprocess_kwargs)
    print(tables)
    for table in tables:
        pprint(table.df)
def read_pdf(filepath, pages: str = "1", password: str = '', flavor: str = "camelotPro",
             pro_kwargs: dict = None, suppress_stdout: bool = False,
             layout_kwargs: dict = None, **kwargs):
    """
    Read PDF and return extracted tables.

    Parameters described below are exclusive for CamelotPro. Please refer to
    the docstring of ``camelot.read_pdf`` for information on the other
    parameters, which are forwarded unchanged for the non-Pro flavors.

    Parameters
    ----------
    flavor : str (default: 'camelotPro')
        [Case-Insensitive] The parsing method to use
        ('lattice' or 'stream' or 'CamelotPro'). 'camelot_pro' and 'pro' are
        accepted as aliases, and passing one of these names as a keyword
        argument also selects the Pro flavor.
    pro_kwargs : dict, required if flavor is "CamelotPro"
        "api_key": str
            Mandatory, to trigger "CamelotPro" flavor, to process scanned
            PDFs and images, also text PDF files.
        "job_id": str
            Empty, to process a new file.
            Mandatory, to retrieve the result of an already submitted file.
        "dup_check": bool, default: False
            False bypasses the duplicate check. Useful to handle duplicate
            requests; the check is based on the file name.
        "max_wait_time": int, default: 300
            Checks for the output every 15 seconds until successfully
            processed or for a maximum of 300 seconds.
        The dict is only read; it is not modified.

    Returns
    -------
    tables : camelot.core.TableList

    Raises
    ------
    IOError
        If the Pro flavor is selected together with a password.
    KeyError
        If the Pro flavor is selected and ``pro_kwargs`` has no "api_key".
    """
    pro_flavors = ("camelotpro", "camelot_pro", "pro")
    if pro_kwargs is None:
        pro_kwargs = {}
    flavor = flavor.lower()
    use_pro = flavor in pro_flavors or any(
        key.lower() in pro_flavors for key in kwargs)

    if not use_pro:
        # The module name was missing from this import (a syntax error);
        # camelot.io is the module that defines camelot's read_pdf.
        from camelot.io import read_pdf as camelot_read_pdf
        tables = camelot_read_pdf(filepath=filepath, pages=pages,
                                  password=password, flavor=flavor,
                                  suppress_stdout=suppress_stdout,
                                  layout_kwargs=layout_kwargs if layout_kwargs else {},
                                  **kwargs)
        if not tables:
            # Nothing found by the open-source parsers: suggest the Pro flavor
            notify(try_pro)
        return tables

    # `password` is a named parameter and therefore never ends up in kwargs,
    # so it has to be checked directly for this guard to take effect.
    if password:
        raise IOError(
            "Pro version does not support the password protected files")

    # .get instead of .pop keeps the caller's dict intact for later calls
    max_wait_time = int(pro_kwargs.get("max_wait_time", 300))
    dup_check = pro_kwargs.get("dup_check", False)

    et_sess = ExtractTable(api_key=pro_kwargs["api_key"])
    job_id = pro_kwargs.get("job_id", "")
    if not job_id:
        et_sess.process_file(filepath, pages=pages, output_format="df",
                             dup_check=dup_check, max_wait_time=max_wait_time,
                             library="camelotpro")
    else:
        et_sess.get_result(job_id, max_wait_time=max_wait_time)
    gp_resp = et_sess.ServerResponse.json()

    from camelot_pro.doppelganger import table_list
    return table_list(gp_resp)
def read_pdf(filepath, pages="1", password=None, flavor="lattice",
             suppress_stdout=False, layout_kwargs=None, pro_kwargs=None, **kwargs):
    """
    Read PDF and return extracted tables.

    Parameters described below are exclusive for CamelotPro. Please refer to
    the docstring of ``camelot.read_pdf`` for information on the other
    parameters, which are forwarded unchanged for the non-Pro flavors.

    Parameters
    ----------
    flavor : str (default: 'lattice')
        [Case-Insensitive] The parsing method to use
        ('lattice' or 'stream' or 'CamelotPro').
    pro_kwargs : dict, required if flavor is "CamelotPro"
        "api_key": str
            Mandatory, to trigger "CamelotPro" flavor, to process scanned
            PDFs and images, also text PDF files.
        "job_id": str
            Optional, if processing a new file.
            Mandatory, to retrieve the result of an already submitted file.
        "dup_check": bool, default: False
            False bypasses the duplicate check. Useful to handle duplicate
            requests; the check is based on the file name.
        "wait_for_output": bool, default: True
            While the job is still processing, check for the output again
            every 20 seconds, for a maximum of 300 seconds. The result is
            returned earlier as soon as processing has finished. A big file
            can alternatively be tracked later using the "JobId" from the
            output.
        The dict is only read; it is not modified.

    Returns
    -------
    tables : camelot.core.TableList
    """
    if pro_kwargs is None:
        pro_kwargs = {}
    # A fresh dict per call instead of a shared mutable default argument
    if layout_kwargs is None:
        layout_kwargs = {}
    flavor = flavor.lower()

    if flavor != "camelotpro":
        # The module name was missing from this import (a syntax error);
        # camelot.io is the module that defines camelot's read_pdf.
        from camelot.io import read_pdf as camelot_read_pdf
        tables = camelot_read_pdf(filepath=filepath, pages=pages,
                                  password=password, flavor=flavor,
                                  suppress_stdout=suppress_stdout,
                                  layout_kwargs=layout_kwargs, **kwargs)
        if not tables:
            # Nothing found by the open-source parsers: suggest the Pro flavor
            notify(try_pro)
        return tables

    from camelot_pro.gopro import GoPro
    from camelot_pro.doppelganger import table_list

    going_pro = GoPro(pro_kwargs.get("api_key", ""))
    gone_pro = going_pro.validate_api_key()

    job_id = pro_kwargs.get("job_id", "")
    if not job_id:
        gp_resp = gone_pro.trigger(filepath, pages, password=password,
                                   dup_check=pro_kwargs.get("dup_check", False))
    else:
        gp_resp = gone_pro.get_tables(job_id)

    # Added default wait time, because early users are confused of no output
    if pro_kwargs.get("wait_for_output", True):
        max_wait = 300
        check_freq = 20
        while max_wait > 0 and gp_resp["JobStatus"].lower().startswith("process"):
            print(
                f'[Info]: Please wait, the Job is: {gp_resp["JobStatus"]} ..'
            )
            max_wait -= check_freq
            time.sleep(check_freq)
            gp_resp = gone_pro.get_tables(job_id=gp_resp["JobId"])

    return table_list(gp_resp)
def run_stream_parse():
    """
    Parse ``sample_pdf_file`` with the stream flavor and print the tables.

    ``io`` must be a module that provides ``read_pdf``; presumably camelot's
    ``io`` module rather than the standard-library one -- verify against the
    file's imports.
    """
    print(io.read_pdf(sample_pdf_file, flavor='stream'))