Example No. 1
 def test_read_pdf_with_option(self):
     pdf_path = 'tests/resources/data.pdf'
     expected_csv1 = 'tests/resources/data_1.csv'
     expected_csv2 = 'tests/resources/data_2-3.csv'
     self.assertTrue(tabula.read_pdf(pdf_path, pages=1).equals(pd.read_csv(expected_csv1)))
     self.assertTrue(tabula.read_pdf(pdf_path, pages='2-3', guess=False).equals(pd.read_csv(expected_csv2)))
     self.assertTrue(tabula.read_pdf(pdf_path, pages=(2, 3), guess=False).equals(pd.read_csv(expected_csv2)))
def san_bernardino_sovc():
    sovc_url = 'http://www.sbcountyelections.com/Portals/9/Elections/2015/0317/SOV%20Book.pdf'
    primary = read_pdf(sovc_url, pages="1-6",
                       pandas_options={'error_bad_lines': False})
    first = primary.loc[((primary.index > 2) & (primary.index < 64))][
        ['Unnamed: 0', 'Unnamed: 5']].rename(columns={'Unnamed: 5': 'votes'})
    second = primary.loc[((primary.index > 67) & (primary.index < 129)) |
                         ((primary.index > 132) & (primary.index < 194)) |
                         ((primary.index > 197) & (primary.index < 243))][
        ['Unnamed: 0', 'Unnamed: 4']].rename(columns={'Unnamed: 4': 'votes'})

    primary = read_pdf(sovc_url, pages="3,5",
                       pandas_options={'error_bad_lines': False})
    third = primary.loc[primary.index > 2][
        ['Unnamed: 0', 'Unnamed: 5']].rename(columns={'Unnamed: 5': 'votes'})

    pcand = 'SHARON RUNNER'
    merged = first.append(second).append(third)
    merged['precinct'] = merged['Unnamed: 0'].apply(
        lambda x: x[len(pcand):][:7] if str(
            x).startswith(pcand) else str(x)[:7])

    return prepare_output(merged[['precinct', 'votes']],
                          'San Bernardino', 21,
                          ['Sharon Runner'])
Example No. 3
 def test_read_pdf_with_multiple_areas(self):
     # Original files are taken from https://github.com/tabulapdf/tabula-java/pull/213
     pdf_path = 'tests/resources/MultiColumn.pdf'
     expected_csv = 'tests/resources/MultiColumn.csv'
     expected_df = pd.read_csv(expected_csv)
     self.assertTrue(tabula.read_pdf(
         pdf_path, pages=1, area=[[0, 0, 100, 50], [0, 50, 100, 100]], relative_area=True).equals(expected_df))
     self.assertTrue(tabula.read_pdf(
         pdf_path, pages=1, area=[[0, 0, 451, 212], [0, 212, 451, 425]]).equals(expected_df))
Example No. 4
    def test_read_pdf_for_multiple_tables(self):
        pdf_path = 'tests/resources/data.pdf'
        expected_csv1 = 'tests/resources/data_1.csv'

        self.assertEqual(len(tabula.read_pdf(pdf_path, pages=2, multiple_tables=True)), 2)
        self.assertTrue(tabula.read_pdf(pdf_path, pages=1, multiple_tables=True)[0].equals(
            pd.read_csv(expected_csv1, header=None)))
        with self.assertRaises(pd.errors.ParserError):
            tabula.read_pdf(pdf_path, pages=2)
Example No. 5
    def read_pdf_page_by_page(pdf_file_path, extraction_path):
        error_pages = []

        # pages = 0
        # try:
        #     pages = PdfFileReader(open(pdf_file_path, 'rb')).getNumPages()
        # except:
        #     print("File can not be read. Please provide a correct file.")

        pages = 5
        for page in range(1, pages):
            try:
                # data = tabula.read_pdf(pdf_file_path, pages=page, silent=True)
                data = tabula.read_pdf(
                    pdf_file_path,
                    pages=page,
                    nospreadsheet=True,
                    area=(210, 20, 691, 573)
                )
            except:
                error_pages.append("|".join([str(page), str(pages)]))
                print("Reading error -", page)
                continue

            if not data.empty:
                out_path = os.path.join(extraction_path, str(page) + ".csv")
                data.to_csv(out_path, index=False)
                print("Page -", page, "(out of", pages, ")", data.shape, out_path)
            else:
                error_pages.append("|".join([str(page), str(pages)]))

        error_file = os.path.join(extraction_path, "errors.txt")
        with open(error_file, "w") as f:
            f.writelines(error_pages)
Example No. 6
 def test_read_pdf_into_json(self):
     pdf_path = 'tests/resources/data.pdf'
     expected_json = 'tests/resources/data_1.json'
     json_data = tabula.read_pdf(pdf_path, output_format='json')
     self.assertTrue(isinstance(json_data, list))
     with open(expected_json) as json_file:
         data = json.load(json_file)
         self.assertEqual(json_data, data)
Example No. 7
 def read_pdf_whole(pdf_path, extraction_path):
     try:
         # data = tabula.read_pdf(pdf_path, pages="all", silent=True)
         data = tabula.read_pdf(pdf_path, pages=2, silent=False)
         out_file = os.path.basename(pdf_path)[:-4]
         out_file_path = os.path.join(extraction_path, out_file + ".csv")
         data.to_csv(out_file_path, index=False)
         print("File-", out_file_path)
         return True
     except:
         print("Error reading the whole file. Switching to page-by-page mode...")
         return False
Example No. 8
 def test_read_pdf_with_pandas_option(self):
     pdf_path = 'tests/resources/data.pdf'
     expected_csv1 = 'tests/resources/data_1.csv'
     column_name = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
     self.assertTrue(tabula.read_pdf(pdf_path, pages=1, pandas_options={'header': None}
                                     ).equals(pd.read_csv(expected_csv1, header=None)))
     self.assertTrue(tabula.read_pdf(pdf_path, pages=1, pandas_options={'header': 0}
                                     ).equals(pd.read_csv(expected_csv1, header=0)))
     self.assertTrue(tabula.read_pdf(pdf_path, pages=1, pandas_options={'header': 'infer'}
                                     ).equals(pd.read_csv(expected_csv1, header='infer')))
     self.assertTrue(
         tabula.read_pdf(
             pdf_path, pages=1, pandas_options={'header': 'infer', 'names': column_name}
         ).equals(pd.read_csv(expected_csv1, header='infer', names=column_name))
     )
     self.assertTrue(
         tabula.read_pdf(
             pdf_path, pages=1, multiple_tables=True,
             pandas_options={'header': 'infer', 'names': column_name}
         )[0].equals(pd.read_csv(expected_csv1, header='infer', names=column_name))
     )
     self.assertTrue(
         tabula.read_pdf(
             pdf_path, pages=1, multiple_tables=True,
             pandas_options={'header': 'infer', 'columns': column_name}
         )[0].equals(pd.read_csv(expected_csv1, header='infer', names=column_name))
     )
Example No. 9
def parse_pdf(path, cuatri, indice, out):
    cols = ["Materia", "Curso", "Nombre", "Docentes", "Día", "Inicio", "Fin", "Sede", "Vacantes"]

    dfr = read_pdf(path,
                   stream=True,
                   guess=False,
                   pages='all',
                   area=[[82, 14.39, 795.38, 581.8], [42.93, 14.39, 796.43, 581.8]],
                   columns=[14.39, 46.94, 68.05, 197.33, 451.53, 475.29, 499.52, 524.23, 556.55])
    dfr = dfr.drop([col for col in dfr.columns if "Unnamed" in col], axis=1)
    dfr.columns = cols

    DIAS = {
        'L': 'Lunes',
        'Ma': 'Martes',
        'Mi': 'Miércoles',
        'J': 'Jueves',
        'V': 'Viernes',
        'S': 'Sábados'
    }
    dfr['Día'] = dfr['Día'].apply(lambda x: str(x).strip()).apply(DIAS.get)

    dfr = dfr.drop(dfr[~dfr.Materia.str.isdigit()].index)

    cols_horarios = ["Día", "Inicio", "Fin", "Sede"]
    dfr['Horarios'] = dfr[cols_horarios].apply(lambda x: ' '.join(map(str, x)), axis=1)
    dfr = dfr.drop(cols_horarios, axis=1)

    def tjoin(x, s):
        try:
            return s.join(set(x))
        except:
            return 'Sin especificar'

    dfr = dfr.groupby(["Materia", "Curso"]).agg({
        'Nombre': lambda x: list(x)[0],
        'Docentes': partial(tjoin, s='-'),
        'Vacantes': lambda x: list(x)[0],
        'Horarios': partial(tjoin, s="\n"),
    }).reset_index()
    dfr['Cuatri'] = cuatri
    dfr['Indice'] = dfr.index + indice

    dfr = dfr[['Indice', 'Cuatri', 'Nombre', 'Materia', 'Horarios']]

    dfr.to_csv(out, index=False)
Example No. 10
def read_pdf(pdf_file_path):
    pdf_dir = pdf_file_path[-10:-7]
    pdf_dir_path = os.path.join(output_dir, pdf_dir)

    try:
        os.mkdir(pdf_dir_path)
    except OSError:
        pass

    # try:
    #     data = tabula.read_pdf(pdf_file_path, pages="all", silent=True)
    #     out_path = os.path.join(pdf_dir_path, pdf_dir+".csv")
    #     data.to_csv(out_path, index=False)
    #     print("File-", pdf_file_path)
    # except:
    #     print("Error reading the whole file. Switching to page-by-page mode...")
    #     pass

    error_pages = []
    pages = PdfFileReader(open(pdf_file_path, 'rb')).getNumPages()
    for page in range(1, pages + 1):
        try:
            data = tabula.read_pdf(pdf_file_path, pages=page, silent=True)
        except:
            error_pages.append("%s|%s" % (page, pages))
            print("Reading error -", page)
            continue

        if not (data is None or data.empty):
            out_path = os.path.join(output_dir, pdf_dir, str(page) + ".csv")
            data.to_csv(out_path, index=False)
            print("Page -", page, "(out of", pages, ")", data.shape, out_path)
        else:
            error_pages.append("%s|%s" % (page, pages))

    error_file = os.path.join(pdf_dir_path, "errors.txt")
    with open(error_file, "w") as f:
        f.writelines(error_pages)
Example No. 11
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tabula import read_pdf

# after this: pip install tabula-py
# and install homebrew java (NOT default java)
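
# Optional pre-flight sketch (assumptions: tabula-py shells out to the `java`
# executable, and the csvs/ output directory used below may not exist yet).
import os
import shutil

if shutil.which('java') is None:
    raise RuntimeError('`java` was not found on PATH; install a JRE/JDK first')
os.makedirs('csvs', exist_ok=True)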

for year in range(2011, 2019):
    for table in range(21, 30):
        try:
            fname = 'pdfs/table_%d_%d.pdf' % (table, year)
            df = read_pdf(fname)
            df.to_csv('csvs/table_%d_%d.csv' % (table, year))
        except:
            pass
Example No. 12
    "income", "profit", "income tax", "expenses", "operat", "year ended", "$M"
]
#assets, current assets, balance sheet, cashflow statement
pdf_document = fitz.open(financial_report)
pages_wterm = []

print(len(pdf_document))

iffound = False
for current_page in range(len(pdf_document)):
    page = pdf_document.loadPage(current_page)
    for index in range(len(search_term)):
        if page.searchFor(search_term[index]):
            iffound = True
        else:
            iffound = False
            break
    if (iffound):
        pages_wterm.append(current_page + 1)
        iffound = False
print(pages_wterm)

# Read the PDF into a list of DataFrames
df = tabula.read_pdf(financial_report, pages=pages_wterm)

# convert PDF into CSV file
tabula.convert_into(financial_report,
                    "output.csv",
                    output_format="csv",
                    pages=pages_wterm)
Example No. 13
#lkoval
#6-19-19

from tabula import read_pdf
import pandas as pd
import string

#Table 1.5
temp = read_pdf("document_1374394.pdf",
                pages="22,23",
                lattice=True,
                pandas_options={'header': None})
temp["raw_chem_name"] = temp.iloc[:, 1]
temp = temp.drop([0, 15])
temp = temp.dropna(subset=["raw_chem_name"])
temp = temp.reset_index()
temp = temp[["raw_chem_name"]]

chemList = []
clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))
for j in range(0, len(temp)):
    temp["raw_chem_name"].iloc[j] = str(
        temp["raw_chem_name"].iloc[j]).strip().lower().replace(
            ".", "").replace("α", "alpha")
    temp["raw_chem_name"].iloc[j] = clean(str(temp["raw_chem_name"].iloc[j]))
    for k in range(0, len(temp["raw_chem_name"].iloc[j].split(","))):
        chemList.append(temp["raw_chem_name"].iloc[j].split(",")[k])

table_1_5 = pd.DataFrame()
table_1_5["raw_chem_name"] = ""
table_1_5["raw_chem_name"] = chemList
Example No. 14
 def test_read_remote_pdf(self):
     uri = "https://github.com/tabulapdf/tabula-java/raw/master/src/test/resources/technology/tabula/12s0324.pdf"
     df = tabula.read_pdf(uri)
     self.assertTrue(isinstance(df, pd.DataFrame))
Example No. 15
 def test_read_pdf_exception(self):
     invalid_pdf_path = 'notexist.pdf'
     with self.assertRaises(FileNotFoundError):
         tabula.read_pdf(invalid_pdf_path)
Example No. 16
def readpdf(file):
    convert_into(file, file.replace(".pdf", ".json"), output_format="json")
    pdf = read_pdf(file)
    write_to_file(pdf)
Example No. 17
'''
 S = single family, 
 T_C = townhouse and condo
 %Chg = year to year % change
 Sales = Closed Sales
'''
column_list = ['Area', 'Single Family Sales', 'Single Family Sales Y2Y %Chg', 'Single Family Median Price', 
               'Single Family Median Price Y2Y %Chg', r'Townhouse/Condo Sales', r'Townhouse/Condo Sales Y2Y %Chg', 
               r'Townhouse/Condo Median Price', r'Townhouse/Condo Median Price Y2Y %Chg']

for index, pdf_filename in enumerate(all_pdf):
    if pdf_filename[1] != 'Q' and pdf_filename.find('MSA') >= 0 and pdf_filename.find('End') < 0:
        # This is monthly MSA stats
        print('%d - %s' % (index, pdf_filename))
        
        t1 = tabula.read_pdf(os.path.join(download_folder, pdf_filename), lattice = True)
        t2 = t1.iloc[1:24, [1, 3, 4, 6, 7, 9, 10, 12, 13]]
        t2.columns = column_list
        
        temp = pdf_filename.split('-')
        t2['Month'] = temp[1] + '-' + month_mapping[temp[0]]
        
        df_list.append(t2)

# Area column may contain noise
monthly_MSA_df = pd.concat(df_list, axis = 0)
monthly_MSA_df.iloc[:, 1] = monthly_MSA_df.iloc[:, 1].str.replace(',', '').astype('float')
monthly_MSA_df.iloc[:, 2] = monthly_MSA_df.iloc[:, 2].str.replace('%', '').astype('float')
monthly_MSA_df.iloc[:, 3] = monthly_MSA_df.iloc[:, 3].str.replace(',', '').str.replace('$', '').astype('float')
monthly_MSA_df.iloc[:, 4] = monthly_MSA_df.iloc[:, 4].str.replace('%', '').astype('float')
monthly_MSA_df.iloc[:, 5] = monthly_MSA_df.iloc[:, 5].str.replace(',', '').astype('float')
Example No. 18
 def test_read_pdf_file_like_obj(self):
     with open(self.pdf_path, "rb") as f:
         df = tabula.read_pdf(f, stream=True)
         self.assertTrue(len(df), 1)
         self.assertTrue(isinstance(df[0], pd.DataFrame))
         self.assertTrue(df[0].equals(pd.read_csv(self.expected_csv1)))
Example No. 19
 def test_read_remote_pdf_with_custom_user_agent(self):
     df = tabula.read_pdf(self.uri, user_agent="Mozilla/5.0", stream=True)
     self.assertTrue(len(df), 1)
     self.assertTrue(isinstance(df[0], pd.DataFrame))
Example No. 20
 def test_read_remote_pdf(self):
     df = tabula.read_pdf(self.uri)
     self.assertTrue(len(df), 1)
     self.assertTrue(isinstance(df[0], pd.DataFrame))
Example No. 21
 def test_read_pdf(self):
     df = tabula.read_pdf(self.pdf_path, stream=True)
     self.assertTrue(len(df), 1)
     self.assertTrue(isinstance(df[0], pd.DataFrame))
     self.assertTrue(df[0].equals(pd.read_csv(self.expected_csv1)))
Example No. 22
 def test_read_pdf_with_java_option(self):
     self.assertTrue(
         tabula.read_pdf(
             self.pdf_path, pages=1, stream=True, java_options=["-Xmx256m"]
         )[0].equals(pd.read_csv(self.expected_csv1))
     )
Example No. 23
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 21 13:17:36 2020

@author: hmusugu
"""
import tabula
import pandas as pd

df1 = tabula.read_pdf('ethylene-prices.pdf', pages=1, lattice=True)
df2 = tabula.read_pdf('ethylene-prices.pdf', pages=2, lattice=True)
df3 = tabula.read_pdf('ethylene-prices.pdf', pages=3, lattice=True)
df4 = tabula.read_pdf('ethylene-prices.pdf', pages=4, lattice=True)
df5 = tabula.read_pdf('ethylene-prices.pdf', pages=5, lattice=True)
df6 = tabula.read_pdf('ethylene-prices.pdf', pages=6, lattice=True)

list1 = (df1[0], df2[0], df3[0], df4[0], df5[0], df6[0])

final = pd.DataFrame()
final = pd.concat(list1)
Example No. 24
 def test_read_pdf_exception(self):
     invalid_pdf_path = 'notexist.pdf'
     with self.assertRaises(subprocess.CalledProcessError):
         tabula.read_pdf(invalid_pdf_path)
Example No. 25
def lambda_handler(event, context):
    # Get the secret
    sm = boto3.client('secretsmanager')
    secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
    secret = json.loads(secretobj['SecretString'])

    s3 = boto3.client('s3')

    messages = []
    # Download the most recently updated PDF file
    for change in event:
        tmp = tempfile.NamedTemporaryFile(suffix='.pdf')
        with open(tmp.name, 'wb') as fp:
            s3.download_fileobj(secret['bucketname'],change['keyname'],fp)
        # Get the date range covered by the report
        text = textract.process(tmp.name, method='pdfminer').decode('utf-8')
        regex = re.compile(r'(\d{1,2})(?:st|nd|rd|th)\s+([A-Z][a-z]+)\s+(\d{4})\s+\–+\s+(\d{1,2})(?:st|nd|rd|th)\s+([A-Z][a-z]+)\s+(\d{4})')
        start_date = None
        end_date = None
        for line in text.split('\n'):
            m = regex.search(line)
            if m:
                start_date = pandas.to_datetime('%s %s %s' %(m.group(1),m.group(2),m.group(3)), format='%d %B %Y').date()
                end_date = pandas.to_datetime('%s %s %s' %(m.group(4),m.group(5),m.group(6)), format='%d %B %Y').date()
                break
        if start_date is None:
            logging.error('Unable to find start date in report')
            return {
                "statusCode": 404,
                "body": 'Unable to find start date in report %s' %change['url'],
            }
        # Get the tables from the report - note that it was not possible to get data from 4th April or earlier due to
        # tables that will not parse properly in the PDF
        tables = tabula.read_pdf(tmp.name, pages = "all", multiple_tables = True)
        tablecount = 0
        dataset = pandas.DataFrame()
        for df in tables:
            if 'Total' not in df.columns:
                firstrow = df.iloc[0]
                newcols = []
                for i in range(len(firstrow)):
                    if isinstance(firstrow[i], float) and math.isnan(firstrow[i]):
                        newcols.append(df.columns[i])
                    else:
                        newcols.append(firstrow[i])
                df.columns = newcols
                df = df[1:]
            df['Setting'] = df['Setting'].str.strip()
            df.dropna(axis='index',subset=['Total','Open','Closed'],inplace=True)
            df['Total'] = df['Total'].astype(int)
            df['Open'] = df['Open'].astype(int)
            df['Closed'] = df['Closed'].astype(int)
            df = df[df['Setting']!='Total']
            if tablecount==0:
                df['Type'] = 'Probable Outbreak'
            elif tablecount==1:
                df['Type'] = 'Cluster'
            else:
                logging.warning('Unexpected table: %s' %df)
            tablecount += 1
            dataset = pandas.concat([dataset, df])
        dataset['Start Date'] = pandas.to_datetime(start_date)
        dataset['End Date'] = pandas.to_datetime(end_date)
        week = int((end_date - pandas.to_datetime('1 January 2020', format='%d %B %Y').date()).days / 7)
        dataset['Week'] = week
        # Create a simple summary and the tweet text
        summary = dataset.groupby('Type').sum()
        tweet = 'NI Contact Tracing reports from %s to %s:\n' %(start_date.strftime('%-d %B %Y'), end_date.strftime('%-d %B %Y'))
        for Type,data in summary.to_dict('index').items():
            tweet += '\u2022 %d %ss (%d open, %d closed)\n' %(data['Total'], Type.lower(), data['Open'], data['Closed'])
        tweet += '\n%s' %change['url']
        # Pull current data from s3
        try:
            obj = s3.get_object(Bucket=secret['bucketname'],Key=secret['pha-clusters-datastore'])['Body']
        except s3.exceptions.NoSuchKey:
            print("The object %s does not exist in bucket %s." %(secret['pha-clusters-datastore'], secret['bucketname']))
            datastore = pandas.DataFrame(columns=['Week'])
        else:
            stream = io.BytesIO(obj.read())
            datastore = pandas.read_csv(stream)
        # Clean out any data with matching dates
        datastore = datastore[datastore['Week'] != week]
        # Append the new data
        datastore = pandas.concat([datastore, dataset])
        datastore['Start Date'] = pandas.to_datetime(datastore['Start Date'])
        datastore['End Date'] = pandas.to_datetime(datastore['End Date'])
        # Replace any known duplicates
        datastore['Setting'] = datastore['Setting'].replace({
            'Cinema/ Theatre / Entertainment': 'Cinema / Theatre / Entertainment Venue',
            'Cinema/ Theatre / Entertainment Venue': 'Cinema / Theatre / Entertainment Venue',
            'Funeral / Wakes': 'Funeral / Wake',
            'Restaurant / Cafe': 'Restaurant / Café'
        })
        # Push the data to s3
        stream = io.BytesIO()
        datastore.to_csv(stream, index=False)
        stream.seek(0)
        s3.upload_fileobj(stream, secret['bucketname'], secret['pha-clusters-datastore'])
        # Set up chromedriver so we can save altair plots
        driver = get_chrome_driver()
        plots = []
        if driver is None:
            logging.error('Failed to start chrome')
        else:
            p = altair.vconcat(
                altair.Chart(
                    dataset
                ).mark_bar().encode(
                    x = altair.X('Total:Q', axis=altair.Axis(title='Total reported')),
                    y = altair.Y('Setting:O'),
                    color='Type',
                    order=altair.Order(
                        'Type',
                        sort='ascending'
                    ),
                ).properties(
                    height=450,
                    width=800,
                    title='NI COVID-19 Contact Tracing reports from %s to %s' %(start_date.strftime('%-d %B %Y'), end_date.strftime('%-d %B %Y'))
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency, does not include education or home settings',
                    'Covers the preceding four weeks',
                    'https://twitter.com/ni_covid19_data on %s'  %datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-outbreaks-week-%s.png'%datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
            p = altair.vconcat(
                altair.Chart(
                    datastore.groupby(['End Date','Type'])['Total'].sum().reset_index()
                ).mark_area().encode(
                    x = altair.X('End Date:T', axis=altair.Axis(title='Date reported (for preceding four weeks)')),
                    y = altair.Y('Total:Q', axis=altair.Axis(title='Total reported', orient="right")),
                    color='Type',
                    order=altair.Order(
                        'Type',
                        sort='ascending'
                    ),
                ).properties(
                    height=450,
                    width=800,
                    title='NI COVID-19 Contact Tracing reports from %s to %s' %(datastore['Start Date'].min().strftime('%-d %B %Y'), datastore['End Date'].max().strftime('%-d %B %Y'))
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency, does not include education or home settings',
                    'Reported weekly for the preceding four weeks',
                    'https://twitter.com/ni_covid19_data on %s'  %datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-outbreaks-time-%s.png'%datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
            p = altair.vconcat(
                altair.Chart(
                    datastore.groupby(['End Date','Setting','Type'])['Total'].sum().reset_index()
                ).mark_area().encode(
                    x = altair.X('End Date:T', axis=altair.Axis(title='')),
                    y = altair.Y('Total:Q', axis=altair.Axis(title='', orient="right")),
                    color='Type',
                    facet=altair.Facet('Setting:O', columns=5, title=None, spacing=0),
                    order=altair.Order(
                        'Type',
                        sort='ascending'
                    ),
                ).properties(
                    height=90,
                    width=160,
                    title=altair.TitleParams(
                        'NI COVID-19 Contact Tracing reports by setting from %s to %s' %(datastore['Start Date'].min().strftime('%-d %B %Y'), datastore['End Date'].max().strftime('%-d %B %Y')),
                        anchor='middle',
                    ),
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency, does not include education or home settings',
                    'Reported weekly for the preceding four weeks',
                    'https://twitter.com/ni_covid19_data on %s'  %datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-outbreaks-small-%s.png'%datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})

        # Convert to dates to ensure correct output to CSV
        datastore['Start Date'] = datastore['Start Date'].dt.date
        datastore['End Date'] = datastore['End Date'].dt.date

        # Tweet out the text and images
        if change.get('notweet') is not True:
            api = TwitterAPI(secret['twitter_apikey'], secret['twitter_apisecretkey'], secret['twitter_accesstoken'], secret['twitter_accesstokensecret'])
            upload_ids = api.upload_multiple(plots)
            if change.get('testtweet') is True:
                if len(upload_ids) > 0:
                    resp = api.dm(secret['twitter_dmaccount'], tweet, upload_ids[0])
                    if len(upload_ids) > 1:
                        resp = api.dm(secret['twitter_dmaccount'], 'Test 1', upload_ids[1])
                        if len(upload_ids) > 2:
                            resp = api.dm(secret['twitter_dmaccount'], 'Test 2', upload_ids[2])
                else:
                    resp = api.dm(secret['twitter_dmaccount'], tweet)
                messages.append('Tweeted DM ID %s' %(resp.id))
            else:
                if len(upload_ids) > 0:
                    resp = api.tweet(tweet, media_ids=upload_ids)
                else:
                    resp = api.tweet(tweet)
                # Download and update the index
                status = S3_scraper_index(s3, secret['bucketname'], secret['pha-clusters-index'])
                index = status.get_dict()
                for i in range(len(index)):
                    if index[i]['filedate'] == change['filedate']:
                        index[i]['tweet'] = resp.id
                        break
                status.put_dict(index)
                messages.append('Tweeted ID %s and updated %s' %(resp.id, secret['pha-clusters-index']))
        else:
            print(tweet)
            messages.append('Did not tweet')

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": messages,
        }),
    }
Example No. 26
 def test_read_pdf(self):
     pdf_path = 'tests/resources/data.pdf'
     expected_csv1 = 'tests/resources/data_1.csv'
     df = tabula.read_pdf(pdf_path)
     self.assertTrue(isinstance(df, pd.DataFrame))
     self.assertTrue(df.equals(pd.read_csv(expected_csv1)))
Example No. 27
def clean_page_single_file(file_path: str) -> pd.DataFrame:
    state_abbr_dict = {
        'Alabama': 'AL',
        'Alaska': 'AK',
        'Arizona': 'AZ',
        'Arkansas': 'AR',
        'California': 'CA',
        'Colorado': 'CO',
        'Connecticut': 'CT',
        'District_of_Columbia': 'DC',
        'Delaware': 'DE',
        'Florida': 'FL',
        'Georgia': 'GA',
        'Hawaii': 'HI',
        'Idaho': 'ID',
        'Illinois': 'IL',
        'Indiana': 'IN',
        'Iowa': 'IA',
        'Kansas': 'KS',
        'Kentucky': 'KY',
        'Louisiana': 'LA',
        'Maine': 'ME',
        'Maryland': 'MD',
        'Massachusetts': 'MA',
        'Michigan': 'MI',
        'Minnesota': 'MN',
        'Mississippi': 'MS',
        'Missouri': 'MO',
        'Montana': 'MT',
        'Nebraska': 'NE',
        'Nevada': 'NV',
        'New_Hampshire': 'NH',
        'New_Jersey': 'NJ',
        'New_Mexico': 'NM',
        'New_York': 'NY',
        'North_Carolina': 'NC',
        'North_Dakota': 'ND',
        'Ohio': 'OH',
        'Oklahoma': 'OK',
        'Oregon': 'OR',
        'Pennsylvania': 'PA',
        'Rhode_Island': 'RI',
        'South_Carolina': 'SC',
        'South_Dakota': 'SD',
        'Tennessee': 'TN',
        'Texas': 'TX',
        'Utah': 'UT',
        'Vermont': 'VT',
        'Virginia': 'VA',
        'Washington': 'WA',
        'West_Virginia': 'WV',
        'Wisconsin': 'WI',
        'Wyoming': 'WY'
    }
    state_name = file_path.split('/')[-1].split('.')[0]
    state_code = state_abbr_dict[state_name]

    df = tabula.read_pdf(file_path, pages='all')
    p1 = df[0]
    p2 = df[1]

    p1.columns = list('abcde')
    p1 = p1.iloc[2:]
    p1['a'] = p1['b']
    p1 = p1[['a', 'c']]
    p1 = p1.iloc[:19]

    p1 = p1.transpose()
    new_header = p1.iloc[0]
    p1 = p1.iloc[1:]
    p1.columns = new_header
    p1.insert(0, 'State', [state_code])
    p1 = p1.reset_index()
    del p1['index']
    p1.columns.name = None
    p1 = p1.replace('---', 'NA')

    p2_columns = [
        'Speciality', 'Physicians', 'People per Physicians',
        'Total female Physicians', 'Percent Female Physicians',
        'Total Physicians > Age 60', 'Percent Physicians > Age 60'
    ]
    p2.columns = p2_columns
    p2 = p2.replace('*', 'NA')
    p2 = p2[['Speciality', 'Physicians']]
    p2 = p2.transpose()

    new_header = p2.iloc[0]
    p2 = p2.iloc[1:]
    p2.columns = new_header
    del p2['SpecialtyPhysiciansPeople Per PhysicianNumberPercentNumberPercent']
    p2.insert(0, 'State', [state_code])
    p2 = p2.reset_index()
    del p2['index']
    p2.columns.name = None

    p1 = p1.join(p2, rsuffix='_p2')
    del p1['State_p2']

    return (p1)
Example No. 28
 def test_read_remote_pdf(self):
     uri = "https://github.com/tabulapdf/tabula-java/raw/master/src/test/resources/technology/tabula/12s0324.pdf"
     df = tabula.read_pdf(uri)
     self.assertTrue(isinstance(df, pd.DataFrame))
Example No. 29
def upload(request):
    """upload page"""
    import yaml
    now = timezone.now()
    output = ""
    status_output = ""

    # Query
    qs = ExpiredProduct.objects.all()
    product_contains = request.GET.get('product')
    pick_date_min = request.GET.get('pick_date_min')
    qs = qs.filter(expired_date__lt=now)
    if is_valid_queryparam(product_contains):
        qs = qs.filter(product__icontains=product_contains)
    if is_valid_queryparam(pick_date_min):
        qs = qs.filter(expired_date__gte=pick_date_min)
    expired_prod = set(qs.values_list('product', flat=True))

    # Check List
    if request.method == 'POST' and "document" in request.FILES:
        if expired_prod:
            dt_str = now.strftime("%d%m%Y_%H%M%S")

            # Read PDF
            myfile = request.FILES['document']
            fs = FileSystemStorage()
            path_file = os.path.join(
                os.path.dirname(__file__),
                "static/upload_files/temp_%s.pdf" % dt_str)
            filename = fs.save(path_file, myfile)

            # Read PDF Setting
            setting_file = os.path.join(os.path.dirname(__file__),
                                        "setting/pdf_settings.yaml")
            pdf_setting = open(setting_file)
            parsed_yaml_file = yaml.load(pdf_setting, Loader=yaml.FullLoader)
            df = tabula.read_pdf(filename,
                                 encoding='utf-8',
                                 pages="all",
                                 area=parsed_yaml_file["detext_position"])

            # Read Column from pdf
            curr_prod = []
            for page in range(len(df)):
                if "型 式" in df[page]:
                    try:
                        curr_prod = curr_prod + list(df[page]["Unnamed: 0"])
                    except:
                        curr_prod = curr_prod + list(df[page]["型 式"])
                    print("Column data (page %d): " % (page + 1),
                          curr_prod)  # to check data in column

            # Combine Rule
            output = []
            expired_prod = ["^" + s + "$" for s in list(expired_prod)]
            combined = "(" + ")|(".join(expired_prod) + ")"

            # Matching
            for prod in set(curr_prod):
                if re.match(combined, str(prod)):
                    output.append(prod)
            os.remove(filename)
            if output:
                output = " 結果 (%s %d): %s" % (str(myfile), len(output), output)
            else:
                output = " 結果 (%s): 問題がありません" % str(myfile)
                status_output = "pass"
        else:
            output = "pass"
            status_output = "pass"

    # UPLOAD XLSX TO DB
    if request.method == 'POST' and "document-db" in request.FILES:
        dt_str = now.strftime("%d%m%Y_%H%M%S")
        count = 0

        # Read XLSX file
        myfile = request.FILES['document-db']
        fs = FileSystemStorage()
        path_file = os.path.join(os.path.dirname(__file__),
                                 "static/upload_files/temp_%s.pdf" % dt_str)
        filename = fs.save(path_file, myfile)
        df_expired = pd.read_excel(filename, index_col=0)
        expired_prod_file = set(df_expired["Unnamed: 4"])

        # Cleaning Data
        for origin_item in expired_prod_file:
            if origin_item and origin_item != "品番" and type(
                    origin_item) == str:
                item = origin_item.replace(" ", "")
                for sub_item in item.split(","):
                    sub_item = sub_item.replace("**/**", "*")
                    for i in range(6, 1, -1):
                        sub_item = sub_item.replace("*" * i, "*")
                    sub_item = sub_item.replace('*', '.*')
                    if sub_item not in expired_prod:
                        count += 1
                        expired_date = list(
                            df_expired[df_expired["Unnamed: 4"] ==
                                       origin_item]["Unnamed: 7"])[0]
                        expired_date = convert_datetime(expired_date)
                        ExpiredProduct.objects.get_or_create(
                            product=sub_item, expired_date=expired_date)
                    else:
                        pass
            else:
                pass
        os.remove(filename)

    context = {
        'all': qs,
        'request': request,
        'output': output,
        'status': status_output
    }

    return render(request, 'upload.html', context)
Example No. 30
import pandas as pd 
import numpy as np 
from PIL import Image
import cv2
import pytesseract
from matplotlib import pyplot as plt

pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract'
TESSDATA_PREFIX = 'C:/Program Files/Tesseract-OCR'

statement = pytesseract.image_to_string(Image.open('statement2.png'),lang='eng',config='-c preserve_interword_spaces=1x1 --psm 1 --oem 3')
df = pytesseract.image_to_data(Image.open('statement2.png'),lang='eng', output_type='data.frame')


from tabula import read_pdf
df = read_pdf('statement_sample1.pdf')

print(df)
print(statement)

import re

s = statement
start = s.find('\n\n \n\n \n\n') + 8
Example No. 31
    def parse(self, response):


        df = tabula.read_pdf(
            '/home/ait-python/Desktop/583/1.pdf',
            pages='1',
            delimeter=',',
            encoding='ISO-8859-1',
            area=(146.498, 30.0, 749.318, 579.27),
            guess=False,
            pandas_options={'header': 'infer'})
        for _, row in df.iterrows():
            self.a=row.tolist()


            lic_no = str(self.a[0])

            print("###################",lic_no)
            fname = str(self.a[2])+" "+str(self.a[3])+" "+str(self.a[4])
            lname = fname.replace("nan","")
            lic_type = "Funeral Director License"
            daob = str(self.a[6])
            daoe = str(self.a[7])



            il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response)
            il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
            il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
            il.add_value('sourceName', 'NV_Funeral_Licenses')
            il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/')
            il.add_value('permit_lic_no', lic_no)
            il.add_value('permit_lic_eff_date', daob)
            il.add_value('permit_subtype', lic_type)
            il.add_value('person_name', fname)
            il.add_value('location_address_string', 'NV')
            il.add_value('company_phone', '')
            il.add_value('permit_type', 'cemetery_funeral_license')
            il.add_value('permit_lic_exp_date', daoe)
            il.add_value('company_name', '')
            yield il.load_item()

        df2 = tabula.read_pdf(
            '/home/ait-python/Downloads/pdf/Embalmers.pdf',
            pages='2',
            delimeter=',',
            encoding='ISO-8859-1',
            area=(70.763, 30.0, 535.883, 580.035),
            guess=False,
            pandas_options={'header': 'infer'})
        for _, row in df2.iterrows():
            self.b=row.tolist()
        
            lic_no = str(self.b[0])+str(self.b[1]).replace('nan','')
            fname = str(self.b[2])+" "+str(self.b[3])+" "+str(self.b[4]).replace('nan','')
            lname = fname.replace('nan','')
            lic_type = "Embalmer License"
            daob = str(self.b[6])
            daoe = str(self.b[7])


            il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response)
            il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
            il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
            il.add_value('sourceName', 'NV_Funeral_Licenses')
            il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/')
            il.add_value('permit_lic_no', lic_no)
            il.add_value('permit_lic_eff_date', daob)
            il.add_value('permit_subtype', lic_type)
            il.add_value('person_name', lname)
            il.add_value('location_address_string', 'NV')
            il.add_value('company_phone', '')
            il.add_value('permit_type', 'cemetery_funeral_license')
            il.add_value('permit_lic_exp_date', daoe)
            il.add_value('company_name', '')
            yield il.load_item()


# #----------------------------------------------------------------------------------2
#         df3 = tabula.read_pdf('/home/ait-python/Downloads/pdf/FuneralArrangersLicensees.pdf',
#         pages = '1',delimeter=',',
#         encoding='ISO-8859-1',
#         area=(155.678,29.835,752.378,582.93),

#         guess=False,
#         pandas_options={'header': 'infer'})
#         for _, row in df3.iterrows():

#             self.c=row.tolist()

#             lic_no = str(self.c[0]).replace('nan','')
#             fname = str(self.c[1])+" "+str(self.c[2])+" "+str(self.c[3])
#             lname = fname.replace('nan','')
#             lic_type = "Funeral Arranger License"
#             daob = str(self.c[5]).replace('nan','')
#             daoe = str(self.c[6]).replace('nan','')

#             il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response)
#             il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
#             il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
#             il.add_value('sourceName', 'NV_Funeral_Licenses')
#             il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/')
#             il.add_value('permit_lic_no', lic_no)
#             il.add_value('permit_lic_eff_date', daob)
#             il.add_value('permit_subtype', lic_type)
#             il.add_value('person_name', lname)
#             il.add_value('location_address_string', 'NV')
#             il.add_value('company_phone', '')
#             il.add_value('permit_type', 'cemetery_funeral_license')
#             il.add_value('permit_lic_exp_date', daoe)
#             il.add_value('company_name', '')
#             yield il.load_item()



#         df4 =tabula.read_pdf('/home/ait-python/Downloads/pdf/FuneralArrangersLicensees.pdf',
#         pages = '2',delimeter=',',
#         encoding='ISO-8859-1',
#         area=(60.818,30.0,767.678,583.86),
#         guess=False,
#         pandas_options={'header': 'infer'})
#         for _, row in df4.iterrows():

#             self.d=row.tolist()

#             lic_no = str(self.d[0]).replace('nan','')
#             fname = str(self.d[1])+" "+ str(self.d[2])+" "+str(self.d[3])
#             lname = fname.replace('nan','')
#             lic_type = "Funeral Arranger License"
#             daob = str(self.d[5])
#             daoe = str(self.d[6]).replace('nan','')
            
#             il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response)
#             il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
#             il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
#             il.add_value('sourceName', 'NV_Funeral_Licenses')
#             il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/')
#             il.add_value('permit_lic_no', lic_no)
#             il.add_value('permit_lic_eff_date', daob)
#             il.add_value('permit_subtype', lic_type)
#             il.add_value('person_name', lname)
#             il.add_value('location_address_string', 'NV')
#             il.add_value('company_phone', '')
#             il.add_value('permit_type', 'cemetery_funeral_license')
#             il.add_value('permit_lic_exp_date', daoe)
#             il.add_value('company_name', '')
#             yield il.load_item()



#         df5 = tabula.read_pdf('/home/ait-python/Downloads/pdf/FuneralArrangersLicensees.pdf',
#         pages = '3',delimeter=',',
#         encoding='ISO-8859-1',
#         area=(60.818,30.0,393.593,580.035),
#         guess=False,
#         pandas_options={'header': 'infer'})
#         for _, row in df5.iterrows():
#             self.e = row.tolist()

#             lic_no = str(self.e[0]).replace('nan','')
#             fname = str(self.e[1])+" "+ str(self.e[2])+" "+str(self.e[3])
#             lname = fname.replace('nan','')
#             lic_type = "Funeral Arranger License"
#             daob = str(self.e[5])
#             daoe = str(self.e[6])
        
#             il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response)
#             il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
#             il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
#             il.add_value('sourceName', 'NV_Funeral_Licenses')
#             il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/')
#             il.add_value('permit_lic_no', lic_no)
#             il.add_value('permit_lic_eff_date', daob)
#             il.add_value('permit_subtype', lic_type)
#             il.add_value('person_name', lname)
#             il.add_value('location_address_string', 'NV')
#             il.add_value('company_phone', '')
#             il.add_value('permit_type', 'cemetery_funeral_license')
#             il.add_value('permit_lic_exp_date', daoe)
#             il.add_value('company_name', '')
#             yield il.load_item()

# # #-----------------------------------------------------------------------------------------------------------------------------------3
#         df6 = tabula.read_pdf('/home/ait-python/Downloads/pdf/Funeral-Directors.pdf',
#         pages = '1',delimeter=',',
#         encoding='ISO-8859-1',
#         area=(146.498,30.0,763.853,580.035),
#         guess=False,
#         pandas_options={'header': 'infer'})
#         for _, row in df6.iterrows():
#             self.f = row.tolist()

#             lic_no = str(self.f[0]).replace('nan','')
#             fname = str(self.f[1])+" "+ str(self.f[2])+" "+str(self.f[3])
#             lname = fname.replace('nan','')
#             lic_type = "Funeral Director License"
#             daob = str(self.f[5])
#             daoe = str(self.f[6])
#             il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response)
#             il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
#             il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
#             il.add_value('sourceName', 'NV_Funeral_Licenses')
#             il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/')
#             il.add_value('permit_lic_no', lic_no)
#             il.add_value('permit_lic_eff_date', daob)
#             il.add_value('permit_subtype', lic_type)
#             il.add_value('person_name', lname)
#             il.add_value('location_address_string', 'NV')
#             il.add_value('company_phone', '')
#             il.add_value('permit_type', 'cemetery_funeral_license')
#             il.add_value('permit_lic_exp_date', daoe)
#             il.add_value('company_name', '')
#             yield il.load_item()

        
#         df7 = tabula.read_pdf('/home/ait-python/Downloads/pdf/Funeral-Directors.pdf',
#         pages = '2,3,4',delimeter=',',
#         encoding='ISO-8859-1',
#         area=(60.818,30.0,751.613,580.035),
#         guess=False,
#         pandas_options={'header': 'infer'})
#         for _, row in df7.iterrows():
#             self.g = row.tolist()

#             lic_no = str(self.g[0]).replace('nan','')
#             fname = str(self.g[1])+" "+ str(self.g[2])+" "+str(self.g[3])
#             lname = fname.replace('nan','')
#             lic_type = "Funeral Director License"
#             daob = str(self.g[5])
#             daoe = str(self.g[6])
#             # print("^^^^^^^^^^^^^^^^^",daoe)
#             il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response)
#             il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
#             il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
#             il.add_value('sourceName', 'NV_Funeral_Licenses')
#             il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/')
#             il.add_value('permit_lic_no', lic_no)
#             il.add_value('permit_lic_eff_date', daob)
#             il.add_value('permit_subtype', lic_type)
#             il.add_value('person_name', lname)
#             il.add_value('location_address_string', 'NV')
#             il.add_value('company_phone', '')
#             il.add_value('permit_type', 'cemetery_funeral_license')
#             il.add_value('permit_lic_exp_date', daoe)
#             il.add_value('company_name', '')
#             yield il.load_item()


# #-----------------------------------------------------


#         # df7 = tabula.read_pdf('/home/ait-python/Downloads/pdf/Funeral-Directors.pdf',
#         # pages = '3',
#         # encoding='ISO-8859-1',
#         # area=(60.818,30.0,751.613,580.035),
#         # guess=False,
#         # pandas_options={'header': 'infer'})
#         # for _, row in df7.iterrows():
#         #     self.g = row.tolist()

#         #     lic_no = str(self.g[0]).replace('nan','')
#         #     fname = str(self.g[1])+" "+ str(self.g[2])+" "+str(self.g[3])
#         #     lname = fname.replace('nan','')
#         #     lic_type = "Funeral Director License"
#         #     daob = str(self.g[5])
#         #     daoe = str(self.g[6])
#         #     # print("@@@@@@@@@@@@@@@@",daoe)

#         # df8 = tabula.read_pdf('/home/ait-python/Downloads/pdf/Funeral-Directors.pdf',
#         # pages = '4',
#         # encoding='ISO-8859-1',
#         # area=(60.818,30.0,751.613,580.035),
#         # guess=False,
#         # pandas_options={'header': 'infer'})
#         # for _, row in df8.iterrows():
#         #     self.h = row.tolist()

#         #     lic_no = str(self.h[0]).replace('nan','')
#         #     print("!!!!!!!!!!!!!!!!!!1",lic_no)
#         #     # fname = str(self.h[1])+" "+ str(self.h[2])+" "+str(self.h[3])
#         #     # lname = fname.replace('nan','')
#         #     # lic_type = "Funeral Director License"
#         #     # daob = str(self.h[5])
#         #     # daoe = str(self.h[6])
# #-----------------------------------------------------------------------------------------4









#         df9 = tabula.read_pdf('/home/ait-python/Downloads/pdf/FuneralEstablishmentsAndDirectCremationFacilities.pdf',
#         pages = '1',delimeter=',',
#         encoding='ISO-8859-1',
#         area=(163.845,23.76,574.695,768.24),
#         guess=False,
#         pandas_options={'header': 'infer'})
#         for _, row in df9.iterrows():
#             self.i = row.tolist()

#             lic_no = str(self.i[0]).replace('nan','')
#             cname = str(self.i[1])
#             ad = str(self.i[2])+", "+str(self.i[3])+", "+"NV "+str(self.i[4])
#             addr = ad.replace("nan","")
#             print("#@@#@#@#@#@#@#@#@#@#",addr)
#             phone = str(self.i[5]).replace("nan","")
#             lic_type = str(self.i[6]).replace("nan","")
#             il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response)
#             il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
#             il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
#             il.add_value('sourceName', 'NV_Funeral_Licenses')
#             il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/')
#             il.add_value('permit_lic_no', lic_no)
#             il.add_value('permit_lic_eff_date','')
#             il.add_value('permit_subtype', lic_type)
#             il.add_value('person_name', '')
#             il.add_value('location_address_string', addr)
#             il.add_value('company_phone', phone)
#             il.add_value('permit_type', 'cemetery_funeral_license')
#             il.add_value('permit_lic_exp_date', '')
#             il.add_value('company_name', cname)
#             yield il.load_item()

#         df10 = tabula.read_pdf('/home/ait-python/Downloads/pdf/FuneralEstablishmentsAndDirectCremationFacilities.pdf',
#         pages = '2',delimeter=',',
#         encoding='ISO-8859-1',
#         area=(68.805,30.0,576.675,762.6),
#         guess=False,
#         pandas_options={'header': 'infer'})
#         for _, row in df10.iterrows():
#             self.j = row.tolist()

#             lic_no = str(self.j[0]).replace('nan','')
#             cname = str(self.j[1])
#             ad = str(self.j[2])+", "+str(self.j[3])+", "+"NV "+str(self.j[4])
#             addr = ad.replace("nan","")
#             phone = str(self.j[5]).replace("nan","")
#             lic_type = str(self.j[6]).replace("nan","")
#             print("#####################",lic_type)
#             il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response)
#             il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
#             il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
#             il.add_value('sourceName', 'NV_Funeral_Licenses')
#             il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/')
#             il.add_value('permit_lic_no', lic_no)
#             il.add_value('permit_lic_eff_date','')
#             il.add_value('permit_subtype', lic_type)
#             il.add_value('person_name', '')
#             il.add_value('location_address_string', addr)
#             il.add_value('company_phone', phone)
#             il.add_value('permit_type', 'cemetery_funeral_license')
#             il.add_value('permit_lic_exp_date', '')
#             il.add_value('company_name', cname)
#             yield il.load_item()

#         df11 =tabula.read_pdf('/home/ait-python/Downloads/pdf/FuneralEstablishmentsAndDirectCremationFacilities.pdf',
#         pages = '3',delimeter=',',
#         encoding='ISO-8859-1',
#         area=(68.805,30.0,576.675,762.6),
#         guess=False,
#         pandas_options={'header': 'infer'})
#         for _, row in df11.iterrows():
#             self.k = row.tolist()

#             lic_no = str(self.k[0]).replace('nan','')
#             cname = str(self.k[1])
#             ad = str(self.k[2])+", "+str(self.k[3])+", "+"NV "+str(self.k[4])
#             addr = ad.replace("nan","")
#             phone = str(self.k[5]).replace("nan","")
#             lic_type = str(self.k[6]).replace("nan","")
#             print("#####################",lic_type)
#             il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response)
#             il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
#             il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
#             il.add_value('sourceName', 'NV_Funeral_Licenses')
#             il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/')
#             il.add_value('permit_lic_no', lic_no)
#             il.add_value('permit_lic_eff_date','')
#             il.add_value('permit_subtype', lic_type)
#             il.add_value('person_name', '')
#             il.add_value('location_address_string', addr)
#             il.add_value('company_phone', phone)
#             il.add_value('permit_type', 'cemetery_funeral_license')
#             il.add_value('permit_lic_exp_date', '')
#             il.add_value('company_name', cname)
#             yield il.load_item()












        





















        # self.state['items_count'] = self.state.get('items_count', 0) + 1
        # il = ItemLoader(item=NvFuneralLicensesSpiderItem(),response=response)
        # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
        # #il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        # il.add_value('sourceName', 'nv_funeral_licenses')
        # il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/')
        # il.add_xpath('permit_lic_no', '')
        # il.add_xpath('permit_lic_eff_date', '')
        # il.add_xpath('permit_subtype', '')
        # il.add_xpath('person_name', '')
        # il.add_xpath('location_address_string', '')
        # il.add_xpath('company_phone', '')
        # il.add_xpath('permit_type', '')
        # il.add_xpath('permit_lic_exp_date', '')
        # il.add_xpath('company_name', '')
        # return il.load_item()
Example No. 32
 def test_read_pdf(self):
     pdf_path = 'tests/resources/data.pdf'
     expected_csv1 = 'tests/resources/data_1.csv'
     df = tabula.read_pdf(pdf_path)
     self.assertTrue(isinstance(df, pd.DataFrame))
     self.assertTrue(df.equals(pd.read_csv(expected_csv1)))
 def pdf_table_to_df(self):
     df = read_pdf(self.get_file_name(), multiple_tables=True, pages='all')
     print(type(df))
     return df
Exemplo n.º 34
0
from tabula import read_pdf
df = read_pdf(r"./credits.pdf",
              encoding="utf-8",
              spreadsheet=True,
              pages="all")
print(df)
Exemplo n.º 35
0
#    response = requests.get(url)
#    filename.write_bytes(response.content)
#####
import os

import numpy as np
import pandas as pd
import tabula

df_list = []

col2str = {'dtype': str}
kwargs = {
    'output_format': 'dataframe',
    'pandas_options': col2str,
    'stream': True
}

for x in os.listdir('data/rki_wochenberichte/'):
    df_list.append(
        tabula.read_pdf('data/rki_wochenberichte/' + x,
                        pages=['2'],
                        multiple_tables=False,
                        **kwargs)[0])
    print(x)

# clean some misread dataframes

df_list[0] = df_list[0].drop(df_list[0].index[[7, 9]])
df_list[0].loc[df_list[0].Bundesland.isna(),
               'Bundesland'] = 'Mecklenburg-Vorpommern'
df_list[2] = pd.DataFrame(np.vstack([df_list[2].columns, df_list[2]]))
df_list[2].columns = df_list[0].columns

for x in range(0, 18):
    print(x)
    df_list[x]['date'] = datelist[x]
Exemplo n.º 36
0
Python 3.6.8 (default, Jan 14 2019, 11:02:34) 
[GCC 8.0.1 20180414 (experimental) [trunk revision 259383]] on linux
Type "help", "copyright", "credits" or "license()" for more information.
>>> from tabula import read_pdf
>>> df =read_pdf("/home/mark/Downloads/451401XXXXXX3663-2019May09-2019Jun10.pdf")
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/tabula/wrapper.py", line 108, in read_pdf
    output = subprocess.check_output(args)
  File "/usr/lib/python3.6/subprocess.py", line 356, in check_output
    **kwargs).stdout
  File "/usr/lib/python3.6/subprocess.py", line 423, in run
    with Popen(*popenargs, **kwargs) as process:
  File "/usr/lib/python3.6/subprocess.py", line 729, in __init__
    restore_signals, start_new_session)
  File "/usr/lib/python3.6/subprocess.py", line 1364, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: 'java': 'java'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<pyshell#1>", line 1, in <module>
    df =read_pdf("/home/mark/Downloads/451401XXXXXX3663-2019May09-2019Jun10.pdf")
  File "/usr/local/lib/python3.6/dist-packages/tabula/wrapper.py", line 111, in read_pdf
    raise JavaNotFoundError(JAVA_NOT_FOUND_ERROR)
tabula.errors.JavaNotFoundError: `java` command is not found from this Python process. Please ensure Java is installed and PATH is set for `java`
>>> df =read_pdf("/home/mark/Downloads/451401XXXXXX3663-2019May09-2019Jun10.pdf")
>>> df
    MAY 06  MAY 10  ...    -$770.00                                       Unnamed: 4
0      NaN     NaN  ...         NaN                Points adjusted this statement 53
1      NaN     NaN  ...         NaN                                              NaN
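The traceback above is tabula-py reporting that it cannot find a Java runtime: the library shells out to tabula-java, so `java` must be on the PATH before read_pdf can work (the second call succeeds only once Java is available). A minimal, standard-library-only pre-flight check might look like the sketch below; the file name is a placeholder.

import shutil

from tabula import read_pdf

# tabula-py delegates the actual extraction to tabula-java, so refuse to run
# without a discoverable `java` executable.
if shutil.which("java") is None:
    raise RuntimeError("`java` not found on PATH; install a JRE/JDK before calling read_pdf")

df = read_pdf("statement.pdf")  # placeholder path
print(df.head())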
def contra_costa_p14(_):
    # TODO(charles-difazio): Grab this in a single parse and then split
    gov_df = tabula.read_pdf(
        'http://www.cocovote.us/wp-content/uploads/14Jun03_ContraCostaStatementofVote.pdf', pages=list(range(465, 483)))
    house_df = tabula.read_pdf(
        'http://www.cocovote.us/wp-content/uploads/14Jun03_ContraCostaStatementofVote.pdf', pages=list(range(483, 495)))
    assembly_df = tabula.read_pdf(
        'http://www.cocovote.us/wp-content/uploads/14Jun03_ContraCostaStatementofVote.pdf', pages=list(range(495, 501)))

    # Rename candidate columns
    gov_df.columns = ['precinct'] + \
        gov_df['Unnamed: 4'][3:6].tolist() + ['dummy']
    # Drop NA rows and total row
    gov_df = gov_df[gov_df.columns[:-1]].dropna(how='any')[:-1]

    # Tidy data
    gov_df = pd.melt(gov_df, id_vars='precinct', value_vars=gov_df.columns[1:],
                     var_name='candidate', value_name='votes')
    gov_df.votes = gov_df.votes.astype(np.int64)
    gov_df = gov_df[gov_df.votes != 0].assign(county='Contra Costa',
                                              office='Governor')
    gov_df['party'] = gov_df.candidate.apply(lambda x: p[x])
    gov_df.precinct = gov_df.precinct.apply(lambda x: x.split(' ')[1])

    house_df = house_df.rename(columns={'2014 STATEWIDE PRIMARY': 'precinct',
                                        'Unnamed: 1': 'Virginia Fuller'})
    house_df['Virginia Fuller'] = pd.to_numeric(
        house_df['Virginia Fuller'], errors='coerce')
    house_df = house_df.dropna(how='any')[:-1]
    house_df = house_df[house_df.precinct != 'CANDIDATES']

    house_df = pd.melt(house_df, id_vars='precinct', value_vars=house_df.columns[1:],
                       var_name='candidate', value_name='votes')
    house_df.votes = house_df.votes.astype(np.int64)
    house_df = house_df[house_df.votes != 0].assign(county='Contra Costa',
                                                    office='U.S. House',
                                                    district='11')
    house_df['party'] = house_df.candidate.apply(lambda x: p[x])
    house_df.precinct = house_df.precinct.apply(
        lambda x: x.split(' ')[1])

    c = assembly_df['Unnamed: 2'][3:5].tolist()
    assembly_df[c[0]], assembly_df[c[1]] = assembly_df[
        'Unnamed: 1'].str.split(' ', n=1).str
    assembly_df = assembly_df.rename(
        columns={'2014 STATEWIDE PRIMARY': 'precinct'})
    assembly_df[c[0]] = pd.to_numeric(assembly_df[c[0]], errors='coerce')
    assembly_df[c[1]] = pd.to_numeric(assembly_df[c[1]], errors='coerce')
    assembly_df = assembly_df[['precinct'] + c]
    assembly_df = assembly_df.dropna(how='any')[:-1]

    assembly_df = pd.melt(assembly_df, id_vars='precinct', value_vars=assembly_df.columns[1:],
                          var_name='candidate', value_name='votes')
    assembly_df.votes = assembly_df.votes.astype(np.int64)
    assembly_df = assembly_df[assembly_df.votes != 0].assign(county='Contra Costa',
                                                             office='State Assembly',
                                                             district='14')
    assembly_df['party'] = assembly_df.candidate.apply(lambda x: p[x])
    assembly_df.precinct = assembly_df.precinct.apply(
        lambda x: x.split(' ')[1])

    return gov_df.to_dict(orient='records') + \
        house_df.to_dict(orient='records') + \
        assembly_df.to_dict(orient='records')
Exemplo n.º 38
0
from tabula import read_pdf

## a built-in ssl module of Python
import ssl

url = "url.pdf"

try:
    df = read_pdf(
        "http://www.sed1.org/afterschool/tookits/science/pdf/ast_sci_data_table_sample.pdf"
    )

    # print the data frame (pandas)
    print(df)

except Exception as e:
    print("Error {}".format(e))
Exemplo n.º 39
0
# https://towardsdatascience.com/scraping-table-data-from-pdf-files-using-a-single-line-in-python-8607880c750

import tabula

infile_name = '/Users/craig/Documents/GEORGIAN-MEGRELIAN-LAZ-SVAN-ENGLISH_DICTIONARY.pdf'

table = tabula.read_pdf(infile_name, pages=1)
table[0]
Exemplo n.º 40
0
import tabula
import pandas as pd
# import time

# start_time = time.time()
tb_name="test"
df = tabula.read_pdf(tb_name+".pdf", pages='all', multiple_tables=True)
df = pd.DataFrame(df)
# df.to_excel('test.xlsx', header=False, index=False)

# df=pd.read_csv(tb_name+".csv", error_bad_lines=False, header=None)

# res = pd.DataFrame()
# n=len(df)

# arr = df.values
# idx = []

# for i in range(0,n):
# 	t=str(arr[i][0])[0:2]

# 	if (t!="31"):
# 		idx.append(i)

# df.drop(df.index[idx], inplace=True)
# df.to_csv(tb_name+'.csv', header=False, index=False)

# Printing Time
# print("--- %s seconds ---" % (time.time() - start_time))
Exemplo n.º 41
0
import tabula
import pandas as pd

# Read pdf into DataFrame (extra options such as pages or area are passed as keyword arguments)
df = tabula.read_pdf("test.pdf", pages="all")

# Read remote pdf into DataFrame
df2 = tabula.read_pdf("https://github.com/tabulapdf/tabula-java/raw/master/src/test/resources/technology/tabula/arabic.pdf")

# convert PDF into CSV
tabula.convert_into("test.pdf", "output.csv", output_format="csv")

# convert all PDFs in a directory
tabula.convert_into_by_batch("input_directory", output_format='csv')
Exemplo n.º 42
0
def get_pdf_dict(lsdyna_manual_file_path, page_numbers, only_page_num_dict=False):
	'''
	@description				: returns a dictionary of keywords and corresponding tables and page numbers in lists

	@lsdyna_manual_file_path	: the path to the LS_Dyna Manual
	@page_numbers				: list of page numbers for which keys and tables must be extracted; if "All" is given, all pages are analyzed
	@only_page_num_dict			: (default:False) if this is set to True, only page numbers are written out for given key word manual
	@returns					: returns a dictionary of keywords and corresponding tables and page numbers in lists
	'''

	ls_key_dict = {}
	ls_file = open(lsdyna_manual_file_path, 'rb')
	ls_file_contents = pdf.PdfFileReader(ls_file)

	if isinstance(page_numbers, int):
		page_numbers = [page_numbers]

	if page_numbers == "All":
		total_pages = ls_file_contents.getNumPages()
		page_numbers = list(range(1, total_pages+1))

	print("\nLooking for tables in the pages - " + str(page_numbers) + ".\n")

	error_pages = []
	for page_num in page_numbers:
		page_key = get_page_key(page_num, ls_file_contents)
		if page_key == "":
			continue

		if ls_key_dict.get(page_key):
			ls_key_dict[page_key]['page_numbers'].append(page_num)
		else:
			ls_key_dict[page_key] = {'tables' : [], 'page_numbers' : [page_num]}

		if only_page_num_dict:
			continue

		#page nos. for this function start from 1
		try:
			page_tables = tabula.read_pdf(lsdyna_manual_file_path, pages=[page_num], multiple_tables=True)
		except:
			error_pages.append(page_num)
			page_tables = None
		if not page_tables:
			ls_key_dict[page_key]['page_numbers'].pop(-1)
			continue

		#collecting only LS Dyna key cards (the first cell in the table has the phrase "Card")
		valid_page_tables = []
		for table in page_tables:
			try:
				if "Card" in table[0][0]:
					#TODO: this area is meant to cleanup tables; try the keycard *DEFINE_COORDINATE_SYSTEM and notice table on Page 1560
					valid_page_tables.append(table)
			except:
				continue

		if valid_page_tables:
			ls_key_dict[page_key]['tables'].append(valid_page_tables)
		else:
			ls_key_dict[page_key]['page_numbers'].pop(-1)

	if error_pages:
		print("There were errors while extracting tables from pages " + str(error_pages))

	return ls_key_dict
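Given the docstring above, a usage sketch might look like the following; the manual path and page numbers are placeholders, and it assumes `pdf` (presumably PyPDF2) and `get_page_key` are defined elsewhere in the same module:

# Hypothetical call: collect keyword cards from a few pages of the manual.
manual_path = "LS-DYNA_manual_Vol1.pdf"          # placeholder path
key_dict = get_pdf_dict(manual_path, [1558, 1559, 1560])

for keyword, info in key_dict.items():
    print(keyword, "pages:", info['page_numbers'], "table groups:", len(info['tables']))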
Exemplo n.º 43
0
import tabula
import camelot
import json
import pymongo
import math

client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client.PreCog_DB_a

# File 1c1edeee-a13e-4b2e-90be-eb1dd03c3384.pdf -------------------------

# Get table
path = "./Rec_Task/1c1edeee-a13e-4b2e-90be-eb1dd03c3384.pdf"
dfs = tabula.read_pdf(path, pages='all')

# conversion to required format
dfs_formatted = dfs[0].to_dict()
for i,v in dfs_formatted.items():
    for j,k in v.items():
        dfs_formatted[i] = k

# insert into collection
tables_1c1edeee = db["tables_1c1edeee"]
tables_1c1edeee.insert(dfs_formatted)

# -------------------------

# File a6b29367-f3b7-4fb1-a2d0-077477eac1d9.pdf -------------------------

# Get table
path = "./Rec_Task/a6b29367-f3b7-4fb1-a2d0-077477eac1d9.pdf"
Exemplo n.º 44
0
import os
import tabula


files = os.listdir()
pdf_files = list(filter(lambda x: x.endswith(".pdf"), files))


for file in pdf_files:
    try:
        data = tabula.read_pdf(file, spreadsheet=True, pages='all')
        data.to_csv(file.replace(".pdf", ".csv"),
                    index=False, quoting=1, encoding='utf8')
        print("File done-", file)
        print()
    except:
        print("Reading error -", file)
        print()
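Since the loop above only needs PDF-to-CSV conversion, the pandas round-trip can be skipped; a sketch using tabula.convert_into, under the same assumption that the PDFs sit in the current directory:

import os

import tabula

for file in filter(lambda x: x.endswith(".pdf"), os.listdir()):
    try:
        # Write the extracted tables straight to a CSV next to the source PDF.
        tabula.convert_into(file, file.replace(".pdf", ".csv"), output_format="csv", pages="all")
        print("File done -", file)
    except Exception as exc:
        print("Reading error -", file, exc)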
Exemplo n.º 45
0
#lkoval
#6-19-19

from tabula import read_pdf
import pandas as pd
import string

#Table 5
table_5=read_pdf("document_1374295.pdf", pages="18", lattice=True, pandas_options={'header': None})
table_5["raw_chem_name"]=table_5.iloc[1:,0]
table_5["raw_cas"]=table_5.iloc[1:,2]
table_5=table_5.dropna(subset=["raw_chem_name"])
table_5=table_5[["raw_chem_name","raw_cas"]]

clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))
for j in range(0, len(table_5)):
    table_5["raw_chem_name"].iloc[j]=str(table_5["raw_chem_name"].iloc[j]).strip().lower().replace(".","").replace("α","alpha").replace("β","beta")
    table_5["raw_chem_name"].iloc[j]=clean(str(table_5["raw_chem_name"].iloc[j]))
    if len(table_5["raw_chem_name"].iloc[j].split())>1:
        table_5["raw_chem_name"].iloc[j]=" ".join(table_5["raw_chem_name"].iloc[j].split())
    if len(str(table_5["raw_cas"].iloc[j]).split())>1:
        table_5["raw_cas"].iloc[j]=" ".join(str(table_5["raw_cas"].iloc[j]).split())

table_5["data_document_id"]="1374295"
table_5["data_document_filename"]="DCPS_83_a.pdf"
table_5["doc_date"]="2007"
table_5["raw_category"]=""
table_5["cat_code"]=""
table_5["description_cpcat"]=""
table_5["cpcat_code"]=""
table_5["cpcat_sourcetype"]="ACToR Assays and Lists"
Exemplo n.º 46
0
import numpy as np
import pandas as pd
import geopandas as gpd
import tabula
import matplotlib.pyplot as plt
plt.style.use('seaborn')

yesterday = '4/16/20'
today = '4/17/20'

df = tabula.read_pdf('data/mexico.pdf',
                     pages='all',
                     output_format='dataframe',
                     multiple_tables=False)
df[0].to_csv('data/covidmxraw.csv')

confirmedraw = pd.read_csv('data/covidmxraw.csv', index_col=0)
print(confirmedraw)
confirmedraw = confirmedraw.drop(
    ['N° Caso', 'Identificación de COVID-\r19 por RT-PCR en tiempo\rreal'],
    axis=1)
confirmedraw = confirmedraw.rename(
    columns={'Fecha de Inicio\rde síntomas': 'Fecha de Inicio de síntomas'})
confirmedraw.to_csv('data/confirmedraw.csv')

confirmed = pd.read_csv('data/time_series_confirmed_MX.csv', index_col=0)
current = confirmedraw.groupby('Estado')['Sexo'].count()
last = confirmed[yesterday]

confirmed[today] = last + (current - last)
confirmed[today][-1] = current.sum()
Exemplo n.º 47
0
 def test_read_pdf_with_java_option(self):
     pdf_path = 'tests/resources/data.pdf'
     expected_csv1 = 'tests/resources/data_1.csv'
     self.assertTrue(tabula.read_pdf(pdf_path, pages=1, java_options=['-Xmx256m']
                                     ).equals(pd.read_csv(expected_csv1)))
Exemplo n.º 48
0
from time import sleep

from selenium import webdriver
from tabula import read_pdf

def fill_t5008_21(ff_driver, box_21):
    div = find_fill_element_with_text(ff_driver, "Proceeds of disposition or settlement amount")
    div.find_element_by_tag_name("input").send_keys(box_21)


if __name__ == '__main__':
    driver = webdriver.Firefox()
    driver.get("https://secure.ufile.ca/T1-2018/Interview?file=e0dfbb3b")
    sleep(5)
    driver.find_element_by_name("Username").send_keys(USERNAME)
    driver.find_element_by_name("Password").send_keys(PASSWORD)
    driver.find_elements_by_class_name("blue-button")[0].click()
    sleep(5)
    driver.find_element_by_class_name("filefieldmain").find_element_by_tag_name("a").click()
    sleep(3)

    df = read_pdf("T5008.pdf", pages=2)

    list_t5008 = []
    for row in df.iterrows():
        if row[1]["Box 13"] == "USD":
            list_t5008.append(row[1])

    for t5008 in list_t5008:
        add_t5008(driver)
        sleep(3)
        fill_t5008(driver, t5008)
        sleep(3)
Exemplo n.º 49
0
import tabula
import os

tables = tabula.read_pdf("1710.05006.pdf", pages="all")

# Create a folder and save the output there
folder_name = "171005006-tables"
if not os.path.isdir(folder_name):
    os.mkdir(folder_name)

# Save each table as a separate Excel file
for i, table in enumerate(tables, start=1):
    table.to_excel(os.path.join(folder_name, f"table_{i}.xlsx"), index=False)

# Saving and conversion can be done in one step
# Supports csv, json, and tsv
# tabula.convert_into("1710.05006.pdf", "output.csv", output_format="csv", pages="all")

# Requires a folder named 'pdfs'
# tabula.convert_into_by_batch("pdfs", output_format="csv", pages="all")