Example #1
    def _getPDFTable(request, context):
        """
        Mirrors the input and sends back the same data.
        :param request: iterable sequence of bundled rows
        :return: the same iterable sequence as received
        """
        # empty parameters
        extraction_method = None
        path = None
        template = None
        for request_rows in request:
            # pull the duals from each row; the three strData values are the
            # extraction method, the PDF path, and the template path
            for row in request_rows.rows:
                if extraction_method is None:
                    extraction_method = [d.strData for d in row.duals][0]
                if path is None:
                    path = [d.strData for d in row.duals][1]
                if template is None:
                    template = [d.strData for d in row.duals][2]
        # read PDF with template
        if extraction_method == 'stream':
            df_list = read_pdf_with_template(path, template, stream=True)
        else:
            df_list = read_pdf_with_template(path, template, lattice=True)
        final_df = pd.DataFrame()
        count = 1
        for df in df_list:
            df['tableID'] = str(count)
            final_df = pd.concat([final_df, df], axis=0, ignore_index=True)
            count = count + 1
        columns = final_df.columns

        # iterate through df columns and format as SSE duals
        dualsList = []
        for col in columns:
            tmpList = final_df[col].tolist()
            dualsList.append([
                SSE.Dual(strData=d) if isinstance(d, str) else SSE.Dual(numData=d)
                for d in tmpList
            ])

        # create response rows (transpose the column-wise duals into rows)
        response_rows = []
        for i in range(len(final_df)):
            duals = [dualsList[z][i] for z in range(len(dualsList))]
            response_rows.append(SSE.Row(duals=iter(duals)))

        # return response
        yield SSE.BundledRows(rows=response_rows)
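Note: the index-based transposition above can be written more compactly with zip; a minimal sketch of the same step, reusing the dualsList built in the example:

    # zip(*dualsList) yields one tuple of duals per output row,
    # equivalent to the nested index loops above.
    response_rows = [SSE.Row(duals=row) for row in zip(*dualsList)]
    yield SSE.BundledRows(rows=response_rows)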
Example #2
def extract_tables(cls, fobj, page_rect: Iterable[PageRect]):
    data = [{
        "page": pr.page.number + 1,
        "extraction_method": "guess",
        "columns": ",".join(map(str, pr.columns)),
        "x1": pr.rect.x0,
        "x2": pr.rect.x1,
        "y1": pr.rect.y0,
        "y2": pr.rect.y1,
    } for pr in page_rect if pr is not None]
    template = io.StringIO(json.dumps(data))
    tables = map(cls.fix_table, tabula.read_pdf_with_template(fobj, template))

    current = None
    for page in page_rect:
        if page is None:
            # Break. Yield the concatenated so far.
            if current is not None:
                yield current
            current = None
        else:
            # Continuation. Concat to current.
            try:
                latest = next(tables)
                if current is None:
                    current = page.account, latest
                else:
                    # DataFrame.append was removed in pandas 2.0; concat instead
                    # (assumes the module imports pandas as pd).
                    current = page.account, pd.concat(
                        [current[1], latest], ignore_index=True)
            except StopIteration:
                break

    if current is not None:
        yield current
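For reference, the list serialized above follows the template format that the Tabula app exports and read_pdf_with_template consumes: one object per table area, with a 1-based page number, an extraction_method of "guess", "stream", or "lattice", and the area corners in points. A minimal standalone sketch (the coordinates and example.pdf are placeholders):

    import io
    import json

    import tabula

    # One entry per table area, mirroring the fields built in the example above.
    template = [{
        "page": 1,                     # 1-based page number
        "extraction_method": "guess",  # "guess", "stream", or "lattice"
        "x1": 50.0, "y1": 100.0,       # top-left corner of the area, in points
        "x2": 550.0, "y2": 400.0,      # bottom-right corner, in points
    }]

    # The template may be a path, a URL, or a file-like object.
    dfs = tabula.read_pdf_with_template("example.pdf",
                                        io.StringIO(json.dumps(template)))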
Example #3
    def test_read_pdf_with_remote_template(self):
        template_path = ("https://github.com/chezou/tabula-py/raw/master/"
                         "tests/resources/data.tabula-template.json")

        dfs = tabula.read_pdf_with_template(self.pdf_path, template_path)
        self.assertEqual(len(dfs), 4)
        self.assertTrue(dfs[0].equals(pd.read_csv(self.expected_csv1)))
Example #4
    def test_read_pdf_with_template(self):
        pdf_path = 'tests/resources/data.pdf'
        template_path = 'tests/resources/data.tabula-template.json'
        expected_csv1 = 'tests/resources/data_1.csv'

        dfs = tabula.read_pdf_with_template(pdf_path, template_path)
        self.assertEqual(len(dfs), 4)
        self.assertTrue(dfs[0].equals(pd.read_csv(expected_csv1)))
Example #5
    def test_read_pdf_with_binary_template(self):
        template_path = "tests/resources/data.tabula-template.json"

        with open(self.pdf_path, "rb") as pdf:
            with open(template_path, "rb") as template:
                dfs = tabula.read_pdf_with_template(pdf, template)
        self.assertEqual(len(dfs), 4)
        self.assertTrue(dfs[0].equals(pd.read_csv(self.expected_csv1)))
Example #6
def to_csv(informeEPI, templateJSON):
    df_list = tabula.read_pdf_with_template(informeEPI, templateJSON, encoding='utf-8')
    comunas = dict()
    file_name = "informeEPI.csv"
    create_headers(file_name)
    for idx, dff in enumerate(df_list):
        key = 'tabla_' + str(idx + 1)
        comunas[key] = dff
        comunas[key].to_csv(file_name, mode='a', header=True, index=False,
                            encoding="utf-8", float_format='%g')
Example #7
def run(region, pdf, template):
    logger.debug(
        f'Start reading for region {region} from {pdf} with template {template}'
    )
    tables = tabula.read_pdf_with_template(pdf, template, stream=True)
    mawaqit_for_wilayas = construct_mawaqit_for_wilayas(tables, region=region)
    for wilaya, mawaqit in mawaqit_for_wilayas.items():
        logger.debug(f'Checking dates for {wilaya}')
        check_dates(mawaqit.index)
    logger.debug(
        f'Exporting wilayas {list(mawaqit_for_wilayas.keys())} to {settings.mawaqit_for_wilayas_dir}'
    )

    export_mawaqit_for_wilayas(mawaqit_for_wilayas)
    logger.debug(f'Reading from {pdf} finished successfully')
Example #8
def read_table_with_template(file,
                             template_file,
                             template_name,
                             fmt_func=None,
                             args=None):
    if args is None:
        args = {}
    template = get_template(template_file, template_name)
    with open('template.json', 'w') as json_file:
        json_file.write(json.dumps(template))
    table = tabula.read_pdf_with_template(file,
                                          'template.json',
                                          pandas_options={'header': None})[0]
    if fmt_func:
        table = fmt_func(table, **args)
    return table
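As Example #2 shows, the template argument can be a file-like object, so the template.json round-trip above is avoidable; a minimal sketch of the same function without the disk write (get_template is the caller-supplied helper from the example):

    import io
    import json

    import tabula

    def read_table_with_template(file, template_file, template_name,
                                 fmt_func=None, args=None):
        if args is None:
            args = {}
        template = get_template(template_file, template_name)
        # Serialize the template into an in-memory file instead of template.json.
        table = tabula.read_pdf_with_template(
            file,
            io.StringIO(json.dumps(template)),
            pandas_options={'header': None})[0]
        if fmt_func:
            table = fmt_func(table, **args)
        return table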
Example #9
    def test_read_pdf_with_dtype_string(self):
        pdf_path = "tests/resources/data_dtype.pdf"
        expected_csv = "tests/resources/data_dtype_expected.csv"
        expected_csv2 = "tests/resources/data_2-3.csv"
        template_path = "tests/resources/data_dtype.tabula-template.json"
        template_expected_csv = "tests/resources/data_dtype_template_expected.csv"

        pandas_options = {"dtype": str}
        self.assertTrue(
            tabula.read_pdf(
                self.pdf_path,
                stream=True,
                pages=1,
                multiple_tables=False,
                pandas_options=pandas_options.copy(),
            )[0].equals(pd.read_csv(self.expected_csv1, **pandas_options))
        )
        self.assertTrue(
            tabula.read_pdf(
                self.pdf_path,
                pages="2-3",
                stream=True,
                guess=False,
                multiple_tables=False,
                pandas_options=pandas_options.copy(),
            )[0].equals(pd.read_csv(expected_csv2, **pandas_options))
        )

        pandas_options = {"header": None, "dtype": str}
        dfs = tabula.read_pdf(
            pdf_path, multiple_tables=True, pandas_options=pandas_options.copy()
        )
        self.assertEqual(len(dfs), 4)
        self.assertTrue(dfs[0].equals(pd.read_csv(expected_csv, **pandas_options)))

        dfs_template = tabula.read_pdf_with_template(
            pdf_path,
            template_path,
            stream=True,
            pages="all",
            pandas_options=pandas_options.copy(),
        )
        self.assertEqual(len(dfs_template), 5)
        self.assertTrue(
            dfs_template[0].equals(pd.read_csv(template_expected_csv, **pandas_options))
        )
Example #10
def ConvertPdfMenuToTable():
    # Read the PDF with a template in lattice mode
    df = tabula.read_pdf_with_template(
        input_path=r'C:\Users\Maxence-Mathieu\Desktop\code\Kleber-menu\menu_kleber.pdf',
        template_path=r'C:\Users\Maxence-Mathieu\Desktop\code\Kleber-menu\menu_kleber.tabula-template.json',
        lattice=True  # , format="CSV", output_path=r'C:\Users\Maxence-Mathieu\Desktop\code\Kleber-menu\menu.csv'
    )
    table = df[0]

    # Only keep the first five rows
    table = table[:5]

    # Remove the allergen columns
    cols = [0, 2, 4, 6, 8]
    table.drop(table.columns[cols], axis=1, inplace=True)

    return table.values.tolist()
Example #11
def getRegionIncidents():
    soup = BeautifulSoup(
        requests.get(
            'https://www.ssi.dk/aktuelt/sygdomsudbrud/coronavirus/covid-19-i-danmark-epidemiologisk-overvaagningsrapport'
        ).text, 'html.parser')
    links = soup.blockquote.find_all('a')
    link = links[0].get('href')

    head, sep, tail = link.partition('?')

    print(head)

    data = tabula.read_pdf_with_template(head,
                                         Path('template-1.json'),
                                         output_format='json')

    incidents = {}  # avoid shadowing the built-in dict

    # The first two JSON tables share the same four-column layout.
    for table in data[:2]:
        for row in table['data']:
            incidents[row[0]['text']] = {
                'confirmed_cases': row[1]['text'],
                'population': row[2]['text'],
                'cumulative_incidence': row[3]['text']
            }

    # api = 'http://api.coronatracker.test/api/upload-region-incidents'
    api = 'https://api.coronatracker.dk/api/upload-region-incidents'

    r = requests.post(url=api, json=incidents)

    print(r.status_code)
    print(r.text)
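Note: with output_format='json', read_pdf_with_template returns a list of table dicts instead of DataFrames; each table's 'data' field holds rows of cell dicts with a 'text' key, which is what the loop above indexes into. The conversion could be factored into a helper; a minimal sketch:

    def table_to_incidents(table):
        # One {region: metrics} entry per row of a tabula JSON table.
        return {
            row[0]['text']: {
                'confirmed_cases': row[1]['text'],
                'population': row[2]['text'],
                'cumulative_incidence': row[3]['text'],
            }
            for row in table['data']
        }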
Example #12
def load_pdf(date, path, lang="en"):
    path += "{0}-{1}.pdf".format(date, lang)
    if date <= "2020-05-27":
        template = select_template(date, APP_PATH)
        print(path, "\n", template)
        dfs = read_pdf_with_template(path,
                                     pandas_options={
                                         "header": None,
                                         "dtype": str
                                     },
                                     template_path=template)
    else:
        # fmt: off
        if date <= "2020-05-28":
            area = [304, 69, 674,
                    547]  # Points: Top Y, Left X, Bottom Y, Right X
            columns = [
                160,
                205,
                254,
                319,
                370,
                432,
                476,
            ]  # Points: X coordinates of column splits

        elif date <= "2020-06-01":
            area = [304, 69, 816,
                    547]  # Points: Top Y, Left X, Bottom Y, Right X
            columns = [
                160,
                205,
                254,
                319,
                370,
                432,
                476,
            ]  # Points: X coordinates of column splits
        else:
            area = [86, 69, 820,
                    547]  # Points: Top Y, Left X, Bottom Y, Right X
            columns = [
                158,
                205,
                254,
                319,
                370,
                432,
                476,
            ]  # Points: X coordinates of column splits
        dfs = read_pdf(
            path,
            pandas_options={
                "header": None,
                "dtype": str
            },
            stream=True,
            pages=3,
            area=area,
            columns=columns,
        )
    # fmt: on
    print(dfs, "\n" * 2)
    df = dfs[0]
    if len(df.columns) == 2:
        print("Extracting 2 data columns")
        df.columns = ["land", "confirmed"]
        df = df.loc[:, ["land", "confirmed"]]
    elif len(df.columns) == 4:
        if date > "2020-03-24":
            print("Extracting 4 data columns after '2020-03-24'")
            df.columns = ["land", "confirmed", "daily", "dead"]
            df["per_mil"] = 0
            df = df.loc[:, ["land", "confirmed", "daily", "per_mil", "dead"]]
        else:
            print("Extracting 4 data columns before '2020-03-24'")
            df.columns = [
                "land", "confirmed", "electronically_submitted", "per_100k"
            ]
            df["per_mil"] = 0
            df = df.loc[:, ["land", "confirmed"]]
    elif len(df.columns) == 5:
        print("Extracting 5 data columns")
        df.columns = ["land", "confirmed", "daily", "per_mil", "dead"]
    elif len(df.columns) == 6:
        print("Extracting 6 data columns")
        df.columns = [
            "land", "confirmed", "daily", "per_mil", "dead", "dead_per_100k"
        ]
    elif len(df.columns) == 8:
        print("Extracting 6 data columns")
        df.columns = [
            "land",
            "confirmed",
            "daily",
            "per_mil",
            "7day_sum",
            "7day_100k",
            "dead",
            "dead_per_100k",
        ]
    else:
        print(f"Falied to exctract {len(df.columns)} data columns")
        print(df.head(20))
        raise Exception
    df = fix_misaligned_row(df)

    # Replace with a clause: if all columns except land contain numbers
    df["c"] = df.loc[:, "confirmed"].apply(count_numbers)
    df["d"] = df.loc[:, "dead"].apply(count_numbers)
    df = df.loc[(df.c != 0) & (df.d != 0), :]

    df.loc[:, ["confirmed"]] = df.loc[:, ["confirmed"]].apply(extract_number,
                                                              axis=1)
    if "dead" in df.columns:
        df.loc[:, ["dead"]] = df.loc[:, ["dead"]].apply(extract_number, axis=1)
    else:
        df["dead"] = 0
    df = df.loc[(df.land.str.contains("cases") == False)
                & (df.land.str.contains("Total") == False)
                & (df.land.str.contains("total") == False)
                & (df.land.str.contains("Gesamt") == False)
                & (df.land.str.contains("gesamt") == False), :, ]
    try:
        df["date"] = datetime.datetime.strptime(date, "%Y-%m-%d")
    except (TypeError, ValueError):
        df["date"] = date
    df = df.loc[:, ["land", "confirmed", "dead", "date"]]
    return df.reset_index(drop=True)
Example #13

# 1. Imports & Sample Files
import tabula
from tabula import read_pdf

pdf_file1 = "https://github.com/tvelichkovt/PDF_Table_Extraction/raw/master/Sample_Files/PDFTableExtraction_Small_Table.pdf"
pdf_file2 = "https://github.com/tvelichkovt/PDF_Table_Extraction/raw/master/Sample_Files/PDFTableExtraction_Tesla_Bonds.pdf"
pdf_file3 = "https://github.com/tvelichkovt/PDF_Table_Extraction/raw/master/Sample_Files/PDFTableExtraction_World_GDP.pdf"
pdf_file4 = "https://github.com/tvelichkovt/PDF_Table_Extraction/raw/master/Sample_Files/PDFTableExtraction_Cars.pdf"
pdf_file5 = "https://databank.worldbank.org/data/download/GDP.pdf"

# 2. Read PDFs

tabula.read_pdf(pdf_file1, pages="all", stream=True)[0]  # read all pages
tabula.read_pdf(pdf_file2, pages=1, stream=True)[0]  # read 1 page
tabula.read_pdf(pdf_file4, pages="1-2", stream=True)[0]  # specific pages

tabula.convert_into(pdf_file1,
                    "PDFTableExtraction_Output.csv",
                    output_format="csv")  # convert into a file (CSV, TSV, or JSON)
tabula.convert_into(pdf_file2,
                    "PDFTableExtraction_Output.json",
                    output_format="json")

# 3. Read with Template

template_path = "https://github.com/chezou/tabula-py/raw/master/tests/resources/data.tabula-template.json"
tabula.read_pdf_with_template(pdf_file4, template_path)
Example #14
def _parse_cheq_save(pdf_path, year, account_type):
    result = set()
    with pdfplumber.open(pdf_path) as pdf:
        template_path = f"{TEMPLATES_DIRECTORY}/{len(pdf.pages)}.json"
        text = ""
        for page in pdf.pages:
            text += page.extract_text(x_tolerance=1)
        opening_bal = _get_opening_bal(text, account_type)
        closing_bal = _get_closing_bal(text, account_type)
    dataframes = tabula.read_pdf_with_template(pdf_path, template_path)
    records = []
    for df in dataframes:
        records.extend(df.where(pd.notnull(df), None).to_dict('records'))
    last_date = None
    add_seconds = 0
    records = records[1:len(records)-1]  # skip Opening/Closing
    for record in records:
        if 'Date Description' in record:
            parts = record['Date Description'].split(' ')
            try:
                if len(parts) > 2 and 0 <= int(parts[0]) <= 31:
                    record['Date'] = ' '.join(parts[:2])
                    record['Description'] = ' '.join(parts[2:])
                else:
                    record['Date'] = None
                    record['Description'] = record['Date Description']
            except ValueError:
                record['Date'] = None
                record['Description'] = record['Date Description']

        if 'Date' not in record:
            continue
        date_str = record['Date']
        if date_str is None:
            date = last_date
        else:
            month = datetime.strptime(date_str.split(' ')[1], '%b').month
            if last_date is not None and month < last_date.month:
                year += 1
            date = datetime.strptime(f"{date_str} {year}", '%d %b %Y')
            last_date = date

        if record['Withdrawals ($)'] is not None:
            amount = -float(str(record['Withdrawals ($)']).replace(',', ''))
        elif record['Deposits ($)'] is not None:
            amount = float(str(record['Deposits ($)']).replace(',', ''))
        else: 
            continue
        description = record['Description']

        transaction = Transaction(account_type,
                                  date,
                                  description,
                                  amount)

        transaction.date += timedelta(seconds=add_seconds)
        add_seconds += 1

        result.add(transaction)

    _validate(opening_bal, closing_bal, result)

    return result
Example #15
from bs4 import BeautifulSoup
import requests
import tabula
import json

url = "https://www.mass.gov/info-details/covid-19-cases-quarantine-and-monitoring#covid-19-cases-in-massachusetts-"
rootURL = "https://www.mass.gov"
response = requests.get(url)
content = BeautifulSoup(response.content, "html.parser")
template_path = "covid-19-case-report.tabula-template.json"
for _, link in enumerate(content('a')):
    if link.has_attr('href'):
        if link.get('href').endswith('/download'):
            if link.string.endswith('2020'):
                r = requests.get(rootURL + link.get('href'))
                with open('COVID_data.pdf', 'wb') as outfile:
                    outfile.write(r.content)
                parsedData = tabula.read_pdf_with_template('COVID_data.pdf',
                                                           template_path,
                                                           stream=True)
                for i, table in enumerate(parsedData):
                    table.dropna(inplace=True)
                    table.rename(
                        columns={'Total Patients': 'Total Patients Positive'},
                        inplace=True)
                    table.rename(columns={'Unnamed: 0': 'Cases'}, inplace=True)
                    table.to_csv(f'{table.columns[0]}.csv', index=False)
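Since read_pdf_with_template also accepts binary file-like objects (see Example #5), the intermediate COVID_data.pdf file could be skipped; a minimal sketch, assuming the same r and template_path as above:

    import io

    # Parse the downloaded bytes directly, without writing COVID_data.pdf.
    parsedData = tabula.read_pdf_with_template(io.BytesIO(r.content),
                                               template_path,
                                               stream=True)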
Example #16
    def read_infos(self) -> List[Info]:
        'read the Info objects from the PDF file'

        # widen every pandas display.max_* option so tables are not truncated
        for name in dir(options.display):
            if 'max' in name:
                setattr(options.display, name, 1000)

        tables = read_pdf_with_template(self.name,
                                        TEMPLATE_PATH,
                                        pandas_options={'header': None},
                                        pages=1,
                                        stream=True)

        table_periodo = tables[0]
        table_money = tables[1]
        table_details = tables[2]
        table_netto_pagare = tables[3]
        table_dati_fiscali = tables[4]
        table_ferie = tables[5]
        try:
            table_legenda_keys = tables[6]
        except IndexError:
            table_legenda_keys = DataFrame()
        try:
            table_legenda_values = tables[7]
        except IndexError:
            table_legenda_values = DataFrame()

        when = extract_periodo(table_periodo)

        minimo = extract_minimo(table_money)
        scatti = extract_scatti(table_money)
        superm = extract_superm(table_money)
        sup_ass = extract_sup_ass(table_money)
        edr = extract_edr(table_money)
        totale_retributivo = extract_totale_retributivo(table_money)

        netto_da_pagare = extract_netto_da_pagare(table_netto_pagare)

        ferie_a_prec = extract_ferie_a_prec(table_ferie)
        ferie_spett = extract_ferie_spett(table_ferie)
        ferie_godute = extract_ferie_godute(table_ferie)
        ferie_saldo = extract_ferie_saldo(table_ferie)
        par_a_prec = extract_par_a_prec(table_ferie)
        par_spett = extract_par_spett(table_ferie)
        par_godute = extract_par_godute(table_ferie)
        par_saldo = extract_par_saldo(table_ferie)

        legenda_ordinario = extract_legenda(table_legenda_keys,
                                            table_legenda_values, 'OR')
        legenda_straordinario = extract_legenda(table_legenda_keys,
                                                table_legenda_values, 'ST')
        legenda_ferie = extract_legenda(table_legenda_keys,
                                        table_legenda_values, 'FR')
        legenda_reperibilita = extract_legenda(table_legenda_keys,
                                               table_legenda_values, 'RA')
        legenda_rol = extract_legenda(table_legenda_keys, table_legenda_values,
                                      'RL')

        additional_details = extract_details(table_details)

        info = Info(when=when,
                    columns=[
                        Column(ColumnHeader.minimo, minimo),
                        Column(ColumnHeader.scatti, scatti),
                        Column(ColumnHeader.superm, superm),
                        Column(ColumnHeader.sup_ass, sup_ass),
                        Column(ColumnHeader.edr, edr),
                        Column(ColumnHeader.totale_retributivo,
                               totale_retributivo),
                        Column(ColumnHeader.netto_da_pagare, netto_da_pagare),
                        Column(ColumnHeader.ferie_a_prec, ferie_a_prec),
                        Column(ColumnHeader.ferie_spett, ferie_spett),
                        Column(ColumnHeader.ferie_godute, ferie_godute),
                        Column(ColumnHeader.ferie_saldo, ferie_saldo),
                        Column(ColumnHeader.par_a_prec, par_a_prec),
                        Column(ColumnHeader.par_spett, par_spett),
                        Column(ColumnHeader.par_godute, par_godute),
                        Column(ColumnHeader.par_saldo, par_saldo),
                        Column(ColumnHeader.legenda_ordinario,
                               legenda_ordinario),
                        Column(ColumnHeader.legenda_straordinario,
                               legenda_straordinario),
                        Column(ColumnHeader.legenda_ferie, legenda_ferie),
                        Column(ColumnHeader.legenda_reperibilita,
                               legenda_reperibilita),
                        Column(ColumnHeader.legenda_rol, legenda_rol)
                    ],
                    additional_details=list(additional_details))

        # there is only an info object in a pdf
        return [info]
Example #17
from tabula import read_pdf
from tabula import read_pdf_with_template
from tabula import convert_into
#df = read_pdf("Roll_Call_5.pdf", pages="all",stream=True)
'''
convert_into("Sample_input_table_data/Roll_Call_5.pdf", "Sample_output_table_data/Roll_Call_5_nostream.csv", output_format="csv", pages='all',lattice = True,stream=False)
'''
dfs = read_pdf_with_template("Sample_input_table_data/Roll_Call_5.pdf",
                             "Sample_input_table_data/template_5.csv",
                             pages='all',
                             lattice=True)

print(dfs[0])  # read_pdf_with_template returns a list of DataFrames
#1->2-5
#3->2-6