def _getPDFTable(request, context):
    """
    Reads tables from a PDF using a Tabula template and returns the rows.
    :param request: iterable sequence of bundled rows
    :return: iterable sequence of bundled rows holding the extracted table data
    """
    # empty parameters
    extraction_method = None
    path = None
    template = None

    for request_rows in request:
        # pull duals from each row, and the strData from duals
        for row in request_rows.rows:
            if extraction_method is None:
                extraction_method = [d.strData for d in row.duals][0]
            if path is None:
                path = [d.strData for d in row.duals][1]
            if template is None:
                template = [d.strData for d in row.duals][2]

    # read PDF with template
    if extraction_method == 'stream':
        df_list = read_pdf_with_template(path, template, stream=True)
    else:
        df_list = read_pdf_with_template(path, template, lattice=True)

    # concatenate the extracted tables, tagging each with a table ID
    final_df = pd.DataFrame()
    count = 1
    for df in df_list:
        df['tableID'] = str(count)
        final_df = pd.concat([final_df, df], axis=0, ignore_index=True)
        count = count + 1

    columns = final_df.columns

    # iterate through df columns and format as SSE duals
    dualsList = []
    for col in columns:
        tmpList = final_df[col].tolist()
        dualsList.append([
            SSE.Dual(strData=d) if type(d) is str else SSE.Dual(numData=d)
            for d in tmpList
        ])

    # create response rows, transposing the per-column duals back into rows
    response_rows = []
    for i in range(len(tmpList)):
        duals = [dualsList[z][i] for z in range(len(dualsList))]
        response_rows.append(SSE.Row(duals=iter(duals)))

    # return response
    yield SSE.BundledRows(rows=response_rows)
def extract_tables(cls, fobj, page_rect: Iterable[PageRect]):
    # build an in-memory Tabula template, one entry per non-None PageRect
    data = [{
        "page": pr.page.number + 1,
        "extraction_method": "guess",
        "columns": ",".join(map(str, pr.columns)),
        "x1": pr.rect.x0,
        "x2": pr.rect.x1,
        "y1": pr.rect.y0,
        "y2": pr.rect.y1,
    } for pr in page_rect if pr is not None]
    template = io.StringIO(json.dumps(data))
    tables = map(cls.fix_table, tabula.read_pdf_with_template(fobj, template))
    current = None
    for page in page_rect:
        if page is None:
            # Break. Yield the table concatenated so far.
            if current is not None:
                yield current
            current = None
        else:
            # Continuation. Concatenate onto the current table.
            try:
                latest = next(tables)
                if current is None:
                    current = page.account, latest
                else:
                    current = page.account, pd.concat(
                        [current[1], latest], ignore_index=True)
            except StopIteration:
                break
    if current is not None:
        yield current
def test_read_pdf_with_remote_template(self):
    template_path = ("https://github.com/chezou/tabula-py/raw/master/"
                     "tests/resources/data.tabula-template.json")
    dfs = tabula.read_pdf_with_template(self.pdf_path, template_path)
    self.assertEqual(len(dfs), 4)
    self.assertTrue(dfs[0].equals(pd.read_csv(self.expected_csv1)))
def test_read_pdf_with_template(self):
    pdf_path = 'tests/resources/data.pdf'
    template_path = 'tests/resources/data.tabula-template.json'
    expected_csv1 = 'tests/resources/data_1.csv'
    dfs = tabula.read_pdf_with_template(pdf_path, template_path)
    self.assertEqual(len(dfs), 4)
    self.assertTrue(dfs[0].equals(pd.read_csv(expected_csv1)))
def test_read_pdf_with_binary_template(self):
    template_path = "tests/resources/data.tabula-template.json"
    with open(self.pdf_path, "rb") as pdf:
        with open(template_path, "rb") as template:
            dfs = tabula.read_pdf_with_template(pdf, template)
    self.assertEqual(len(dfs), 4)
    self.assertTrue(dfs[0].equals(pd.read_csv(self.expected_csv1)))
def to_csv(informeEPI, templateJSON):
    df = tabula.read_pdf_with_template(informeEPI, templateJSON, encoding='utf-8')
    comunas = dict()
    file_name = "informeEPI.csv"
    create_headers(file_name)
    # append each extracted table to the same CSV; header=True writes
    # a header row before every table
    for idx, dff in enumerate(df):
        key = 'tabla_' + str(idx + 1)
        comunas[key] = dff
        comunas[key].to_csv(file_name, mode='a', header=True, index=False,
                            encoding="utf-8", float_format='%g')
def run(region, pdf, template):
    logger.debug(
        f'Start reading for region {region} from {pdf} with template {template}'
    )
    tables = tabula.read_pdf_with_template(pdf, template, stream=True)
    mawaqit_for_wilayas = construct_mawaqit_for_wilayas(tables, region=region)
    for wilaya, mawaqit in mawaqit_for_wilayas.items():
        logger.debug(f'Checking dates for {wilaya}')
        check_dates(mawaqit.index)
    logger.debug(
        f'Exporting wilayas {list(mawaqit_for_wilayas.keys())} to {settings.mawaqit_for_wilayas_dir}'
    )
    export_mawaqit_for_wilayas(mawaqit_for_wilayas)
    logger.debug(f'Reading from {pdf} finished successfully')
def read_table_with_template(file, template_file, template_name,
                             fmt_func=None, args=None):
    if args is None:
        args = {}
    template = get_template(template_file, template_name)
    # write the selected template out so tabula can read it back in
    with open('template.json', 'w') as json_file:
        json_file.write(json.dumps(template))
    table = tabula.read_pdf_with_template(
        file, 'template.json', pandas_options={'header': None})[0]
    if fmt_func:
        table = fmt_func(table, **args)
    return table
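# A minimal alternative sketch (assuming get_template() from the snippet
# above): read_pdf_with_template() also accepts a file-like object for the
# template, as the binary-template test earlier shows, so the intermediate
# 'template.json' on disk can be skipped.
import io
import json

import tabula


def read_table_with_template_inmemory(file, template_file, template_name):
    # serialize the chosen template into an in-memory buffer
    template = io.StringIO(json.dumps(get_template(template_file, template_name)))
    return tabula.read_pdf_with_template(
        file, template, pandas_options={'header': None})[0]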
def test_read_pdf_with_dtype_string(self):
    pdf_path = "tests/resources/data_dtype.pdf"
    expected_csv = "tests/resources/data_dtype_expected.csv"
    expected_csv2 = "tests/resources/data_2-3.csv"
    template_path = "tests/resources/data_dtype.tabula-template.json"
    template_expected_csv = "tests/resources/data_dtype_template_expected.csv"
    pandas_options = {"dtype": str}
    self.assertTrue(
        tabula.read_pdf(
            self.pdf_path,
            stream=True,
            pages=1,
            multiple_tables=False,
            pandas_options=pandas_options.copy(),
        )[0].equals(pd.read_csv(self.expected_csv1, **pandas_options))
    )
    self.assertTrue(
        tabula.read_pdf(
            self.pdf_path,
            pages="2-3",
            stream=True,
            guess=False,
            multiple_tables=False,
            pandas_options=pandas_options.copy(),
        )[0].equals(pd.read_csv(expected_csv2, **pandas_options))
    )
    pandas_options = {"header": None, "dtype": str}
    dfs = tabula.read_pdf(
        pdf_path, multiple_tables=True, pandas_options=pandas_options.copy()
    )
    self.assertEqual(len(dfs), 4)
    self.assertTrue(dfs[0].equals(pd.read_csv(expected_csv, **pandas_options)))
    dfs_template = tabula.read_pdf_with_template(
        pdf_path,
        template_path,
        stream=True,
        pages="all",
        pandas_options=pandas_options.copy(),
    )
    self.assertEqual(len(dfs_template), 5)
    self.assertTrue(
        dfs_template[0].equals(pd.read_csv(template_expected_csv, **pandas_options))
    )
def ConvertPdfMenuToTable():
    # Read the PDF with a template in lattice mode
    df = tabula.read_pdf_with_template(
        input_path=r'C:\Users\Maxence-Mathieu\Desktop\code\Kleber-menu\menu_kleber.pdf',
        template_path=r'C:\Users\Maxence-Mathieu\Desktop\code\Kleber-menu\menu_kleber.tabula-template.json',
        lattice=True
        # , format="CSV", output_path=r'C:\Users\Maxence-Mathieu\Desktop\code\Kleber-menu\menu.csv'
    )
    table = df[0]

    # Only keep the first five rows
    table = table[:5]

    # Remove the columns about allergens
    cols = [0, 2, 4, 6, 8]
    table.drop(table.columns[cols], axis=1, inplace=True)

    return table.values.tolist()
def getRegionIncidents():
    soup = BeautifulSoup(
        requests.get(
            'https://www.ssi.dk/aktuelt/sygdomsudbrud/coronavirus/covid-19-i-danmark-epidemiologisk-overvaagningsrapport'
        ).text,
        'html.parser')
    links = soup.blockquote.find_all('a')
    link = links[0].get('href')
    head, sep, tail = link.partition('?')
    print(head)
    data = tabula.read_pdf_with_template(head, Path('template-1.json'),
                                         output_format='json')
    # map each region name to its figures from the first two extracted tables
    incidents = {}
    for row in data[0]['data']:
        incidents[row[0]['text']] = {
            'confirmed_cases': row[1]['text'],
            'population': row[2]['text'],
            'cumulative_incidence': row[3]['text']
        }
    for row in data[1]['data']:
        incidents[row[0]['text']] = {
            'confirmed_cases': row[1]['text'],
            'population': row[2]['text'],
            'cumulative_incidence': row[3]['text']
        }
    # api = 'http://api.coronatracker.test/api/upload-region-incidents'
    api = 'https://api.coronatracker.dk/api/upload-region-incidents'
    r = requests.post(url=api, json=incidents)
    print(r.status_code)
    print(r.text)
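# For reference, a rough sketch of the structure that output_format='json'
# returns (cell attributes abridged):
#
#   [
#     {
#       "extraction_method": "stream",
#       "data": [
#         [{"text": "Region"}, {"text": "Confirmed"}, ...],  # one list per row
#         ...
#       ]
#     },
#     ...  # one dict per extracted table
#   ]
#
# which is why the snippet above indexes cells as row[i]['text'].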
def load_pdf(date, path, lang="en"):
    path += "{0}-{1}.pdf".format(date, lang)
    if date <= "2020-05-27":
        template = select_template(date, APP_PATH)
        print(path, "\n", template)
        dfs = read_pdf_with_template(path,
                                     pandas_options={
                                         "header": None,
                                         "dtype": str
                                     },
                                     template_path=template)
    else:
        # fmt: off
        if date <= "2020-05-28":
            area = [304, 69, 674, 547]  # Points: Top Y, Left X, Bottom Y, Right X
            columns = [160, 205, 254, 319, 370, 432, 476]  # Points: X coordinates of column splits
        elif date <= "2020-06-01":
            area = [304, 69, 816, 547]  # Points: Top Y, Left X, Bottom Y, Right X
            columns = [160, 205, 254, 319, 370, 432, 476]  # Points: X coordinates of column splits
        else:
            area = [86, 69, 820, 547]  # Points: Top Y, Left X, Bottom Y, Right X
            columns = [158, 205, 254, 319, 370, 432, 476]  # Points: X coordinates of column splits
        dfs = read_pdf(
            path,
            pandas_options={
                "header": None,
                "dtype": str
            },
            stream=True,
            pages=3,
            area=area,
            columns=columns,
        )
        # fmt: on
    print(dfs, "\n" * 2)
    df = dfs[0]
    if len(df.columns) == 2:
        print("Extracting 2 data columns")
        df.columns = ["land", "confirmed"]
        df = df.loc[:, ["land", "confirmed"]]
    elif len(df.columns) == 4:
        if date > "2020-03-24":
            print("Extracting 4 data columns after '2020-03-24'")
            df.columns = ["land", "confirmed", "daily", "dead"]
            df["per_mil"] = 0
            df = df.loc[:, ["land", "confirmed", "daily", "per_mil", "dead"]]
        else:
            print("Extracting 4 data columns before '2020-03-24'")
            df.columns = ["land", "confirmed", "electronically_submitted", "per_100k"]
            df["per_mil"] = 0
            df = df.loc[:, ["land", "confirmed"]]
    elif len(df.columns) == 5:
        print("Extracting 5 data columns")
        df.columns = ["land", "confirmed", "daily", "per_mil", "dead"]
    elif len(df.columns) == 6:
        print("Extracting 6 data columns")
        df.columns = ["land", "confirmed", "daily", "per_mil", "dead", "dead_per_100k"]
    elif len(df.columns) == 8:
        print("Extracting 8 data columns")
        df.columns = [
            "land", "confirmed", "daily", "per_mil", "7day_sum", "7day_100k",
            "dead", "dead_per_100k",
        ]
    else:
        print(f"Failed to extract {len(df.columns)} data columns")
        print(df.head(20))
        raise Exception
    df = fix_misaligned_row(df)
    # Replace with a clause: if all columns except land contain numbers
    df["c"] = df.loc[:, "confirmed"].apply(count_numbers)
    df["d"] = df.loc[:, "dead"].apply(count_numbers)
    df = df.loc[(df.c != 0) & (df.d != 0), :]
    df.loc[:, ["confirmed"]] = df.loc[:, ["confirmed"]].apply(extract_number, axis=1)
    if "dead" in df.columns:
        df.loc[:, ["dead"]] = df.loc[:, ["dead"]].apply(extract_number, axis=1)
    else:
        df["dead"] = 0
    # drop summary rows such as "Total"/"Gesamt"
    df = df.loc[
        (df.land.str.contains("cases") == False)
        & (df.land.str.contains("Total") == False)
        & (df.land.str.contains("total") == False)
        & (df.land.str.contains("Gesamt") == False)
        & (df.land.str.contains("gesamt") == False),
        :,
    ]
    try:
        df["date"] = datetime.datetime.strptime(date, "%Y-%m-%d")
    except ValueError:
        df["date"] = date
    df = df.loc[:, ["land", "confirmed", "dead", "date"]]
    return df.reset_index(drop=True)
# 1. Imports & Sample Files
import tabula
from tabula import read_pdf

pdf_file1 = "https://github.com/tvelichkovt/PDF_Table_Extraction/raw/master/Sample_Files/PDFTableExtraction_Small_Table.pdf"
pdf_file2 = "https://github.com/tvelichkovt/PDF_Table_Extraction/raw/master/Sample_Files/PDFTableExtraction_Tesla_Bonds.pdf"
pdf_file3 = "https://github.com/tvelichkovt/PDF_Table_Extraction/raw/master/Sample_Files/PDFTableExtraction_World_GDP.pdf"
pdf_file4 = "https://github.com/tvelichkovt/PDF_Table_Extraction/raw/master/Sample_Files/PDFTableExtraction_Cars.pdf"
pdf_file5 = "https://databank.worldbank.org/data/download/GDP.pdf"

# 2. Read PDFs
tabula.read_pdf(pdf_file1, pages="all", stream=True)[0]  # read all pages
tabula.read_pdf(pdf_file2, pages=1, stream=True)[0]  # read one page
tabula.read_pdf(pdf_file4, pages="1-2", stream=True)[0]  # read specific pages
tabula.convert_into(pdf_file1, "PDFTableExtraction_Output.csv", output_format="csv")  # extract into a JSON, TSV, or CSV file
tabula.convert_into(pdf_file2, "PDFTableExtraction_Output.json", output_format="json")

# 3. Read with Template
template_path = "https://github.com/chezou/tabula-py/raw/master/tests/resources/data.tabula-template.json"
tabula.read_pdf_with_template(pdf_file4, template_path)
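# A short follow-on sketch: read_pdf_with_template() returns a list of
# DataFrames, one per template region, so the results can be combined and
# exported with plain pandas (the output file name here is illustrative).
import pandas as pd
import tabula

dfs = tabula.read_pdf_with_template(pdf_file4, template_path)
combined = pd.concat(dfs, ignore_index=True)
combined.to_csv("PDFTableExtraction_Template_Output.csv", index=False)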
def _parse_cheq_save(pdf_path, year, account_type):
    result = set()
    with pdfplumber.open(pdf_path) as pdf:
        # one template per page count
        template_path = f"{TEMPLATES_DIRECTORY}/{len(pdf.pages)}.json"
        text = ""
        for page in pdf.pages:
            text += page.extract_text(x_tolerance=1)
    opening_bal = _get_opening_bal(text, account_type)
    closing_bal = _get_closing_bal(text, account_type)
    dataframes = tabula.read_pdf_with_template(pdf_path, template_path)
    records = []
    for df in dataframes:
        records.extend(df.where(pd.notnull(df), None).to_dict('records'))
    last_date = None
    add_seconds = 0
    records = records[1:len(records) - 1]  # skip Opening/Closing
    for record in records:
        # some rows come through with date and description fused in one cell
        if 'Date Description' in record:
            parts = record['Date Description'].split(' ')
            try:
                if len(parts) > 2 and 0 <= int(parts[0]) <= 31:
                    record['Date'] = ' '.join(parts[:2])
                    record['Description'] = ' '.join(parts[2:])
                else:
                    record['Date'] = None
                    record['Description'] = record['Date Description']
            except ValueError:
                record['Date'] = None
                record['Description'] = record['Date Description']
        if 'Date' not in record:
            continue
        date_str = record['Date']
        if date_str is None:
            date = last_date
        else:
            month = datetime.strptime(date_str.split(' ')[1], '%b').month
            # a backwards month step means the statement crossed a year boundary
            if last_date is not None and month < last_date.month:
                year += 1
            date = datetime.strptime(f"{date_str} {year}", '%d %b %Y')
            last_date = date
        if record['Withdrawals ($)'] is not None:
            amount = -float(str(record['Withdrawals ($)']).replace(',', ''))
        elif record['Deposits ($)'] is not None:
            amount = float(str(record['Deposits ($)']).replace(',', ''))
        else:
            continue
        description = record['Description']
        transaction = Transaction(account_type, date, description, amount)
        # offset each transaction by one second to keep same-day ordering unique
        transaction.date += timedelta(seconds=add_seconds)
        add_seconds += 1
        result.add(transaction)
    _validate(opening_bal, closing_bal, result)
    return result
from bs4 import BeautifulSoup
import requests
import tabula
import json

url = "https://www.mass.gov/info-details/covid-19-cases-quarantine-and-monitoring#covid-19-cases-in-massachusetts-"
rootURL = "https://www.mass.gov"
response = requests.get(url)
content = BeautifulSoup(response.content, "html.parser")
template_path = "covid-19-case-report.tabula-template.json"

# find each 2020 report download link, save the PDF, and extract its tables
for _, link in enumerate(content('a')):
    if link.has_attr('href'):
        if link.get('href').endswith('/download'):
            if link.string.endswith('2020'):
                r = requests.get(rootURL + link.get('href'))
                with open('COVID_data.pdf', 'wb') as outfile:
                    outfile.write(r.content)
                parsedData = tabula.read_pdf_with_template(
                    'COVID_data.pdf', template_path, stream=True)
                for i, table in enumerate(parsedData):
                    table.dropna(inplace=True)
                    table.rename(
                        columns={'Total Patients': 'Total Patients Positive'},
                        inplace=True)
                    table.rename(columns={'Unnamed: 0': 'Cases'}, inplace=True)
                    table.to_csv(f'{table.columns[0]}.csv', index=False)
def read_infos(self) -> List[Info]:
    'read from a file'
    # widen pandas display limits so nothing is truncated during extraction
    for name in dir(options.display):
        if 'max' in name:
            setattr(options.display, name, 1000)
    tables = read_pdf_with_template(self.name,
                                    TEMPLATE_PATH,
                                    pandas_options={'header': None},
                                    pages=1,
                                    stream=True)
    table_periodo = tables[0]
    table_money = tables[1]
    table_details = tables[2]
    table_netto_pagare = tables[3]
    table_dati_fiscali = tables[4]
    table_ferie = tables[5]
    try:
        table_legenda_keys = tables[6]
    except IndexError:
        table_legenda_keys = DataFrame()
    try:
        table_legenda_values = tables[7]
    except IndexError:
        table_legenda_values = DataFrame()

    when = extract_periodo(table_periodo)
    minimo = extract_minimo(table_money)
    scatti = extract_scatti(table_money)
    superm = extract_superm(table_money)
    sup_ass = extract_sup_ass(table_money)
    edr = extract_edr(table_money)
    totale_retributivo = extract_totale_retributivo(table_money)
    netto_da_pagare = extract_netto_da_pagare(table_netto_pagare)
    ferie_a_prec = extract_ferie_a_prec(table_ferie)
    ferie_spett = extract_ferie_spett(table_ferie)
    ferie_godute = extract_ferie_godute(table_ferie)
    ferie_saldo = extract_ferie_saldo(table_ferie)
    par_a_prec = extract_par_a_prec(table_ferie)
    par_spett = extract_par_spett(table_ferie)
    par_godute = extract_par_godute(table_ferie)
    par_saldo = extract_par_saldo(table_ferie)
    legenda_ordinario = extract_legenda(table_legenda_keys,
                                        table_legenda_values, 'OR')
    legenda_straordinario = extract_legenda(table_legenda_keys,
                                            table_legenda_values, 'ST')
    legenda_ferie = extract_legenda(table_legenda_keys,
                                    table_legenda_values, 'FR')
    legenda_reperibilita = extract_legenda(table_legenda_keys,
                                           table_legenda_values, 'RA')
    legenda_rol = extract_legenda(table_legenda_keys,
                                  table_legenda_values, 'RL')
    additional_details = extract_details(table_details)

    info = Info(when=when,
                columns=[
                    Column(ColumnHeader.minimo, minimo),
                    Column(ColumnHeader.scatti, scatti),
                    Column(ColumnHeader.superm, superm),
                    Column(ColumnHeader.sup_ass, sup_ass),
                    Column(ColumnHeader.edr, edr),
                    Column(ColumnHeader.totale_retributivo, totale_retributivo),
                    Column(ColumnHeader.netto_da_pagare, netto_da_pagare),
                    Column(ColumnHeader.ferie_a_prec, ferie_a_prec),
                    Column(ColumnHeader.ferie_spett, ferie_spett),
                    Column(ColumnHeader.ferie_godute, ferie_godute),
                    Column(ColumnHeader.ferie_saldo, ferie_saldo),
                    Column(ColumnHeader.par_a_prec, par_a_prec),
                    Column(ColumnHeader.par_spett, par_spett),
                    Column(ColumnHeader.par_godute, par_godute),
                    Column(ColumnHeader.par_saldo, par_saldo),
                    Column(ColumnHeader.legenda_ordinario, legenda_ordinario),
                    Column(ColumnHeader.legenda_straordinario, legenda_straordinario),
                    Column(ColumnHeader.legenda_ferie, legenda_ferie),
                    Column(ColumnHeader.legenda_reperibilita, legenda_reperibilita),
                    Column(ColumnHeader.legenda_rol, legenda_rol)
                ],
                additional_details=list(additional_details))
    # a PDF contains exactly one Info object
    return [info]
from tabula import read_pdf
from tabula import read_pdf_with_template
from tabula import convert_into

# df = read_pdf("Roll_Call_5.pdf", pages="all", stream=True)
'''
convert_into("Sample_input_table_data/Roll_Call_5.pdf",
             "Sample_output_table_data/Roll_Call_5_nostream.csv",
             output_format="csv", pages='all', lattice=True, stream=False)
'''

# read_pdf_with_template returns a list of DataFrames, one per template region
dfs = read_pdf_with_template("Sample_input_table_data/Roll_Call_5.pdf",
                             "Sample_input_table_data/template_5.csv",
                             pages='all', lattice=True)
for df in dfs:
    print(df)
# 1->2-5
# 3->2-6