def tabula_table_generator(file, start_page, pdf_name_data, pdf_name_count,
                           page_break_count):
    '''Generates the list of tables from the pages where the required
    heading (like 'Precision') data is presented in the pdf.

    Parameters:
        file(str): filename or the DOWID of the document
        start_page(number): page number of the pdf where the heading
            (like 'Precision') data starts
        pdf_name_data(list): the set of pdf ids along with some text found
            in the extracted heading text
        pdf_name_count(number): how many times the pdf name occurred in the
            extracted heading text
        page_break_count(number): count of pages containing the required
            heading (like 'Precision') data

    Returns:
        tabledata(list): list of tables found on the pages of the pdf that
            contain the required heading (like 'Precision') data
    '''
    tabledata = []
    if (pdf_name_data != []) and (pdf_name_count != 0):
        new_pages = str(start_page + 1) + "-" + str(start_page + pdf_name_count + 1)
    elif page_break_count != 0:
        new_pages = str(start_page + 1) + "-" + str(start_page + page_break_count + 1)
    else:
        new_pages = str(start_page + 1)
    tabledata = read_pdf(file,
                         output_format="dataframe",
                         pages=new_pages,
                         lattice=True,
                         multiple_tables=True,
                         encoding='latin-1',
                         position="absolute")
    # Fall back to stream mode when lattice mode finds no tables
    if tabledata == []:
        tabledata = read_pdf(file,
                             output_format="dataframe",
                             pages=new_pages,
                             lattice=False,
                             multiple_tables=True,
                             encoding='latin-1',
                             position="absolute")
    return tabledata
def multiple(file_path, columns):
    """ Convert a pdf table to a DataFrame page by page """
    DATA = pd.DataFrame(columns=list(range(0, 7)))
    tables = read_pdf(
        file_path,
        # encoding='windows-1251',
        encoding='utf-8',
        lattice=True,
        pandas_options={'header': None},
        pages='all',
        multiple_tables=True)
    for t in tables:
        # Because of parsing mistakes we sometimes need to delete empty columns
        if t.shape[1] > 7:
            t.dropna(axis='columns', how='all', inplace=True)
            # This solution works only if there is at least one full row on the page
            t.rename(columns=col_name(t.keys(), list(range(0, 7))), inplace=True)
        DATA = DATA.append(t, ignore_index=True)
    DATA.rename(columns=col_name(DATA.keys(), columns), inplace=True)
    DATA.drop(0, inplace=True)
    return DATA
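# col_name is used above but not defined in this snippet. A minimal sketch of
# what it might do, assuming it simply zips the current column labels with the
# desired names into a rename mapping (hypothetical helper, not the original):
def col_name(old_keys, new_names):
    """Build an {old_label: new_label} dict for DataFrame.rename."""
    return dict(zip(old_keys, new_names))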
def main():
    getCleanPdf()
    # By now, the clean pdf is clean.pdf
    df = wrapper.read_pdf("clean.pdf")
    writer = pd.ExcelWriter('out.xlsx', engine='xlsxwriter')
    df.to_excel(writer, sheet_name='Sheet1')
    writer.save()
def download_extract(paper, extract_figure=False, extract_table=False):
    if paper.pages >= 0 and paper.table >= 0:
        return False
    paper_info = {
        'pdf_url': paper.url,
        'title': paper.title,
    }
    api_paper = arxiv.query(id_list=[paper.arvixID])[0]
    if 'pdf_url' not in api_paper:
        return False
    pdf_url = api_paper['pdf_url']
    # pdf_url = 'https://arxiv.org/pdf/' + paper.url.split('/')[-1] + '.pdf'
    file_path = os.path.join(store_path, paper.paperId + '.pdf')
    # if not os.path.isfile(file_path):
    urllib.request.urlretrieve(pdf_url, file_path)
    if extract_table:
        df = wrapper.read_pdf(file_path, multiple_tables=True, pages='all')
        table_count = len(df)
        del df
    if extract_figure:
        figure_count, page_count = get_figure_count(file_path)
    modified = False
    if paper.pages == -1:
        modified = True
        paper.pages = page_count
    else:
        page_count = paper.pages
    if paper.table == -1:
        modified = True
        paper.table = table_count
    if os.path.exists(file_path):
        os.remove(file_path)
    if modified:
        Paper.update(table=table_count, pages=page_count).where(
            Paper.arvixID == paper.arvixID).execute()
        # paper.save()
    return modified
    # NOTE: everything below is unreachable because of the return above.
    # api_paper = arxiv.query(id_list=[paper.arvixID])[0]
    # if 'pdf_url' not in api_paper:
    #     return False
    # pdf_url = api_paper['pdf_url']
    texts = extract_text(file_path, pdf_url)
    if texts is None:
        print("PDF either does not exist or failed: ", paper.url)
        return False
    affiliation = []
    for text in texts.split():
        if re.match(r"[^@]+@[^@]+\.[^@]+", text):
            domain_name = text.split('@')[-1]
            affiliation.append(domain_name)
    if len(affiliation) > 0:
        Paper.update(affiliation=affiliation).where(
            Paper.arvixID == paper.arvixID).execute()
    return False
def pdfProcess(inputFolder, url):
    # If the pdf data comes from a different site, this function has to be rewritten
    # HTML parsing: fetch the serial number of the newly updated pdf file
    resp = requests.get(
        "http://fsc.go.kr/info/trd_list.jsp?menu=7230000&bbsid=BBS0069")
    resp.encoding = 'utf-8'
    html = resp.text
    bs = BeautifulSoup(html, 'html.parser')
    # Fetch the latest data on every run
    # Extract the data
    originalData = bs.select(
        "#contents > div.board > table > tbody > tr > td > a")
    convertedData = str(originalData[2])
    dailyUpdated = convertedData.split('"')[1].split('amp;')[1]
    url = url + "&" + dailyUpdated  # it did not work before because the '&' was missing
    df2 = wrapper.read_pdf(url,
                           multiple_tables=True,
                           pages="all",
                           pandas_options={"header": 0})
    # Remove special characters from the column headers
    for i in range(0, len(df2)):
        colName = df2[i].columns.tolist()
        for j in range(0, len(colName)):
            colName[j] = re.sub('[-=+,#/\?:^$.%@*\"※~&ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]',
                                '', colName[j])
        df2[i].columns = colName
    # Get today's date to use in the table names
    currentYear = datetime.today().year
    currentMonth = datetime.today().month
    currentDate = datetime.today().day
    currentYear = str(currentYear)[-2:]
    if currentMonth < 10:
        currentMonth = "0" + str(currentMonth)
    else:
        currentMonth = str(currentMonth)
    if currentDate < 10:
        currentDate = "0" + str(currentDate)
    else:
        currentDate = str(currentDate)
    today = currentYear + currentMonth + currentDate
    # Save each table to the database (Postgres)
    engine = create_engine('postgresql://*****:*****@192.168.110.111:5432/kopo')
    for i in range(0, len(df2)):
        fileName = inputFolder + "_" + today + "_" + str(i)
        df2[i].to_sql(fileName, engine, if_exists='replace', index=False)
def pdfParser(pdfFile):
    df = wrapper.read_pdf(
        pdfFile,
        spreadsheet=True,
        # area = (136.43, 58.64, 602.63, 554.93))
        area=(129.52, 59.99, 602.63, 553.13))  # (top, left, bottom, right)
    df = pd.DataFrame(data=df)
    return df
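# tabula's `area` argument is (top, left, bottom, right) measured in PDF
# points (1/72 inch). A sketch of how coordinates like the ones above could be
# derived from measurements taken in a PDF viewer (illustrative numbers only):
inches = (1.8, 0.83, 8.37, 7.68)  # top, left, bottom, right in inches
points = tuple(round(v * 72, 2) for v in inches)  # ~ (129.6, 59.76, 602.64, 552.96)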
def dataPrep(url):
    # download file
    fileName = 'cb10.pdf'
    pdf = urllib.request.urlretrieve(url, fileName)
    tables = wrapper.read_pdf(fileName, lattice=True, multiple_tables=True)
    # select largest table
    largest = pd.DataFrame()
    for i in tables:
        if i.size >= largest.size:
            largest = i
    # get rid of the top column names
    table = largest.copy()
    dayOfWeekRow = None
    for index, row in table.iterrows():
        dayChars = [d[:2] for d in calendar.day_name[:]]
        rowString = [
            str(i).strip('\r').title()[:2] for i in list(row) if not pd.isna(i)
        ]
        if len(set(dayChars) & set(rowString)) >= 7:
            dayOfWeekRow = index
    table.columns = table.iloc[dayOfWeekRow]
    table = table[dayOfWeekRow + 1:]
    # drop nan columns
    table = table.loc[:, table.columns.notnull()]
    # replace nan with a blank string
    table = table.fillna('')
    events = []
    for column in table:
        dayOfWeek = column
        values = table[column].values
        event = {}
        # if the day in the first row has a value, init weekNumber to 0, otherwise to 1
        if table[:1].get(column).values != '':
            weekNumber = 0
        else:
            weekNumber = 1
        # go through each cell of the column and check for an integer;
        # if there is an integer start a new day, otherwise just append the text
        for cell in values:
            date = re.search(r"^(\d*)", cell).group(0)
            if date:
                weekNumber += 1
            event[column] = {'text': cell, 'weekNumber': weekNumber}
            if event not in events:
                events.append(event)
            event = {}
    return events
def pdfocr():
    # pdf.image(imgpath, x, y, w, h)
    # pdf.output("/home/caratred/text.pdf", "F")
    a = wrapper.read_pdf("/home/caratred/Downloads/mrzlines.pdf",
                         pages="1",
                         index=False,
                         stream=True,
                         squeeze=True,
                         header=None)
    print(a)
    b = a.iloc[:, 0:3]
    print(b)
def all_one(file_path, columns):
    """ Convert the whole pdf table to a DataFrame in one go """
    df = read_pdf(
        file_path,
        # encoding='windows-1251',
        encoding='utf-8',
        lattice=True,
        pandas_options={'header': columns},
        pages='all')
    return df
def vertical_column_names(pdf_name, column_names_area):
    '''Extract vertical column names by rotating the file and using tabula'''
    rotated_pdf_name = rotate_pdf_90(pdf_name)
    list_of_column_names = wrapper.read_pdf(
        rotated_pdf_name,
        stream=True,
        multiple_tables=True,
        pages='all',
        area=(column_names_area[0], column_names_area[1],
              column_names_area[2], column_names_area[3]),
        guess=False)
    return list_of_column_names
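# rotate_pdf_90 is called above but not defined in this snippet. A minimal
# sketch of what such a helper could look like, assuming PyPDF2 1.x and that
# it writes a rotated copy next to the input and returns the new file name
# (hypothetical implementation, not the original):
import PyPDF2

def rotate_pdf_90(pdf_name):
    """Write a copy of pdf_name with every page rotated 90 degrees clockwise."""
    rotated_name = pdf_name.replace('.pdf', '_rotated.pdf')
    with open(pdf_name, 'rb') as in_file:
        reader = PyPDF2.PdfFileReader(in_file)
        writer = PyPDF2.PdfFileWriter()
        for page_number in range(reader.getNumPages()):
            page = reader.getPage(page_number)
            page.rotateClockwise(90)
            writer.addPage(page)
        with open(rotated_name, 'wb') as out_file:
            writer.write(out_file)
    return rotated_name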
def pdf_to_text(pdf_name, columns_vertical_bool, tables_area, offices_area,
                column_names_area):
    '''Read text from a pdf using the tabula library into three separate lists
    (list of tables, list of offices and list of column headers)'''
    # get all tables (area of row only = (171.743, 0.383, 790.628, 611.618))
    list_of_tables = wrapper.read_pdf(pdf_name,
                                      stream=True,
                                      multiple_tables=True,
                                      pages='all',
                                      area=(tables_area[0], tables_area[1],
                                            tables_area[2], tables_area[3]),
                                      guess=False)
    # get all office names
    list_of_offices = wrapper.read_pdf(pdf_name,
                                       stream=True,
                                       multiple_tables=True,
                                       pages='all',
                                       area=(offices_area[0], offices_area[1],
                                             offices_area[2], offices_area[3]),
                                       guess=False)
    # get all column names
    # if there are vertical columns, run the helper function (vertical_column_names)
    if columns_vertical_bool == "y":
        list_of_column_names = vertical_column_names(pdf_name, column_names_area)
    # if there aren't vertical columns, extract normally
    else:
        list_of_column_names = wrapper.read_pdf(
            pdf_name,
            stream=True,
            multiple_tables=True,
            pages='all',
            area=(column_names_area[0], column_names_area[1],
                  column_names_area[2], column_names_area[3]),
            guess=False)
    return list_of_tables, list_of_offices, list_of_column_names
def getPdf(self):
    tables = wrapper.read_pdf("qw.pdf",
                              multiple_tables=True,
                              pages='all',
                              encoding='utf-8',
                              spreadsheet=True)
    i = 1
    row_array = []
    row_array2 = []
    for table in tables:
        table.columns = table.iloc[0]
        table = table.reindex(table.index.drop(0)).reset_index(drop=True)
        table.columns.name = None
        # write to Excel
        table.to_excel('output' + str(i) + '.xlsx', header=True, index=False)
        workbook = xlrd.open_workbook('output' + str(i) + '.xlsx')
        worksheet = workbook.sheet_by_name('Sheet1')
        num_rows = 5  # worksheet.nrows - 1
        curr_row = 4
        while curr_row < num_rows:
            row = worksheet.col(curr_row)
            # print(row)
            row2 = worksheet.col(curr_row - 1)
            row_array += row
            row_array2 += row2
            curr_row += 1
        i = i + 1
    for i in range(len(row_array)):
        # print(row[i].split("'"))
        # k = string(row[i])
        if row_array[i].value != '':
            if row_array[i].value == 'Grade':
                self.semesterCount += 1
                self.gradesArray.append(int(self.semesterCount))
                self.creditsArray.append('D')
            else:
                print(row_array[i].value)
                gradeSingle = str(row_array[i].value)
                creditSingle = int(row_array2[i].value)
                self.gradesArray.append(gradeSingle)
                self.creditsArray.append(creditSingle)
        # gradesArray += row_array[i].value
        # creditsArray += row_array2[i].value
    print(self.gradesArray)
    print(self.creditsArray)
def pdfProcess(inputFolder, url):
    # If the pdf data comes from a different site, this function has to be rewritten
    # HTML parsing: fetch the serial number of the newly updated pdf file
    resp = requests.get(
        "http://fsc.go.kr/info/trd_list.jsp?menu=7230000&bbsid=BBS0069")
    resp.encoding = 'utf-8'
    html = resp.text
    bs = BeautifulSoup(html, 'html.parser')
    # Fetch the latest data on every run
    # Extract the data
    originalData = bs.select(
        "#contents > div.board > table > tbody > tr > td > a")
    convertedData = str(originalData[2])
    dailyUpdated = convertedData.split('"')[1].split('amp;')[1]
    url = url + "&" + dailyUpdated  # it did not work before because the '&' was missing
    df2 = wrapper.read_pdf(url,
                           multiple_tables=True,
                           pages="all",
                           pandas_options={"header": 0})
    # Save as csv files
    # Get today's date to use in the file names
    currentYear = datetime.today().year
    currentMonth = datetime.today().month
    currentDate = datetime.today().day
    currentYear = str(currentYear)[-2:]
    if currentMonth < 10:
        currentMonth = "0" + str(currentMonth)
    else:
        currentMonth = str(currentMonth)
    if currentDate < 10:
        currentDate = "0" + str(currentDate)
    else:
        currentDate = str(currentDate)
    today = currentYear + currentMonth + currentDate
    # Save each table as a file
    for i in range(0, len(df2)):
        fileName = inputFolder + "_" + today + "_" + str(i) + '.csv'
        df2[i].to_csv("../../data/outbound/" + inputFolder + "/" + fileName,
                      index=False,
                      encoding="ms949")
def vat_return_reader(file_name):
    """vat_return_reader(file_name)
    Builds a Series from the last column (index 2) of the VAT return table,
    named after the file the function was given."""
    df = wrapper.read_pdf(file_name,
                          pandas_options={'header': None},
                          encoding="cp775")
    print("tabula_read executed")
    column_data = df.loc[0:45, 2]
    column_name = df.loc[0:45, 1]
    # replaces ',' with '.' so values can be treated as numeric
    column_data = column_data.astype(str).str.replace(',', '.')
    # converts str to float
    column_data = pd.to_numeric(column_data, errors='coerce')
    # names the Series after the file name
    column_data = column_data.rename(index=f"{file_name}")
    return column_data
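# A possible way to apply vat_return_reader to a folder of returns and collect
# one named column per file (usage sketch; the glob pattern is an assumption):
import glob
import pandas as pd

columns = [vat_return_reader(name) for name in glob.glob('*.pdf')]
report = pd.concat(columns, axis=1)  # one column per VAT return file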
from tabula import wrapper
import pandas as pd
import numpy as np
from datetime import timedelta, datetime
import tkinter as tk
from tkinter import filedialog
import re, shutil

tkroot = tk.Tk()
tkroot.withdraw()
file = filedialog.askopenfilename()

df = wrapper.read_pdf(file, pages='all', error_bad_lines=False)
print(df)

writer = pd.ExcelWriter('output.xlsx')
df.to_excel(writer, 'Sheet1')
writer.save()
from tabula import wrapper

wrapper.read_pdf("", area=[])
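# The call above is only a stub (empty path, empty area). A filled-in call
# might look like this, with a hypothetical file name and coordinates in PDF
# points as (top, left, bottom, right):
# wrapper.read_pdf("statement.pdf", area=[126.0, 36.0, 680.0, 576.0])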
import numpy as np
from datetime import timedelta, datetime
import tkinter as tk
from tkinter import filedialog
from tabula import wrapper  # needed for wrapper.read_pdf below

tkroot = tk.Tk()
tkroot.withdraw()
file = filedialog.askopenfilename()
invoice = file[0:7]
# tkinter etc

df = wrapper.read_pdf(file)

# rename columns
df = df.rename(
    columns={
        'AGCO International GmbH': 'data',
        'Unnamed: 1': 'data1',
        'Unnamed: 2': 'data2',
    })

# get invoice from filename
# df['connection'] =

# new df with relevant rows
def pdfConverter(self):
    from tabula.wrapper import read_pdf
    return read_pdf(self.path, pages='all', lattice=True)
from tabula import wrapper
import os

print(os.listdir("."))
filepath = r"Birddogs-BofA-Bank-Statement-April.'17.pdf"
tables = wrapper.read_pdf(filepath,
                          pages='all',
                          silent=True,
                          pandas_options={
                              'header': None,
                              'error_bad_lines': False,
                              'warn_bad_lines': False
                          })
# tables = wrapper.read_pdf(filepath, multiple_tables=True, pages='all')
folder = "results"
if not os.path.isdir(folder):
    os.makedirs(folder)
i = 1
# empty the results folder before writing new files
for the_file in os.listdir(folder):
    file_path = os.path.join(folder, the_file)
    try:
        if os.path.isfile(file_path):
            os.unlink(file_path)
    except Exception as e:
        print(e)
for r in tables.values.tolist():
    print(r)
# for table in tables:
#     table.to_excel(os.path.join(folder, 'Results' + str(i) + '.xlsx'), index=False)
# Read pdf into DataFrame
# df = tabula.read_pdf("test.pdf", options)
import os

from tabula import wrapper

# df = wrapper.read_pdf('../sample-data/B1705_2000560947.pdf', output_format='dataframe', guess=False)
# df = wrapper.read_pdf('../sample-data/B1705_2000560948.pdf', output_format='dataframe', guess=False)
# df = wrapper.read_pdf('../sample-data/B1705_2000604352 (1).pdf', output_format='dataframe', guess=False)

collection_frame = []
for file in os.listdir("../sample-data/"):
    if file.endswith(".pdf"):
        # print(file)
        filepath = "../sample-data/{}".format(file)
        print(filepath)
        df = wrapper.read_pdf(filepath, output_format='dataframe', guess=False)
        list(df)
        RBC_Value = df['Unnamed: 0'][df['Unnamed: 0'].str.contains(
            "RBC").__eq__(True)]
        Fibrinogen_Value = df['Unnamed: 0'][df['Unnamed: 0'].str.contains(
            "FIBRINOGEN").__eq__(True)]
        # getting rows from start to finish
        test = df['Unnamed: 0'].str.split(' ', expand=True)[1][11:32]
        # print(test)
        test2 = df['Unnamed: 0'].str.split(' ', expand=True)[2][18]
        test3 = df['Unnamed: 0'].str.split(' ', expand=True)[3][30]
        # print(test3)
        platelet_appearance = df['Unnamed: 0'].str.split(' ',
from tabula import wrapper
import xlrd
import openpyxl

tables = wrapper.read_pdf("qw.pdf",
                          multiple_tables=True,
                          pages='all',
                          encoding='utf-8',
                          spreadsheet=True)
i = 1
row_array = []
row_array2 = []
semesterCount = 0
for table in tables:
    table.columns = table.iloc[0]
    table = table.reindex(table.index.drop(0)).reset_index(drop=True)
    table.columns.name = None
    # write to Excel
    table.to_excel('output' + str(i) + '.xlsx', header=True, index=False)
    workbook = xlrd.open_workbook('output' + str(i) + '.xlsx')
    worksheet = workbook.sheet_by_name('Sheet1')
    num_rows = 5  # worksheet.nrows - 1
    curr_row = 4
    while curr_row < num_rows:
        row = worksheet.col(curr_row)
        # print(row)
        row2 = worksheet.col(curr_row - 1)
        row_array += row
        row_array2 += row2
        curr_row += 1
    i = i + 1
def check_tables(file_name):
    df = wrapper.read_pdf(str(file_name), pages='all')
    return len(df) > 0
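# Usage sketch for check_tables, filtering a list of candidate documents
# (hypothetical file names):
pdf_files = ['report_q1.pdf', 'report_q2.pdf']
files_with_tables = [f for f in pdf_files if check_tables(f)]
print(files_with_tables)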
def parser(fn, path):
    famheader = {
        0: 'first_name',
        1: 'last_name',
        2: 'place_of_birth',
        3: 'date_of_birth',
        4: 'relationship'
    }
    propheader = {
        0: 'owner',
        1: 'type',
        2: 'purchase_date',
        3: 'purchase_type',
        4: 'price',
        5: 'location_area',
        6: 'owners'
    }
    # Georgian section headings used to recognise the family and property tables
    fam = 'თქვენი ოჯახის წევრების (მეუღლე, არასრულწლოვანი შვილი, (გერი), თქვენთან მუდმივად მცხოვრები პირი) მონაცემები'
    prop = 'თქვენი, თქვენი ოჯახის წევრის საკუთრებაში არსებული უძრავი ქონება'
    rdate = re.compile(r'\s*\d\d\.\d\d.\d{4}$')
    # 'place of birth, date of birth:' label
    place_and_date = r'დაბადების ადგილი, დაბადების თარიღი:\s*'
    dcl_data = {}
    FDF = make_fdf()
    PDF = make_pdf()
    dfs = read_pdf(os.path.join(path, fn),
                   pages='all',
                   lattice=True,
                   multiple_tables=True)
    inf_flag = dfs[1].loc[0, 0]
    p_n_d = re.search(place_and_date + '(.+)', inf_flag)
    if p_n_d:
        pnd = p_n_d.group(1)
        place = re.sub(rdate, '', pnd)
        dcl_data['place_of_birth'] = place
        date = re.search(rdate, pnd)
        if date:
            date = date.group().strip()
            dcl_data['date_of_birth'] = date
    for df in dfs[2:]:
        if df.shape != (0, 0):
            if df.loc[0, 0] == fam:
                FDF = FDF.append(get_info(df, famheader))
            if df.loc[0, 0] == prop:
                PDF = PDF.append(get_info(df, propheader))
    if FDF.shape != (0, 0):
        FDF.fillna('', inplace=True)
        FDF.reset_index(drop=True, inplace=True)
        FDF = fullrows(FDF, 'first_name')
        FDF.loc[:, 'first_name_lat'] = FDF.first_name.apply(transli)
        FDF.loc[:, 'last_name_lat'] = FDF.last_name.apply(transli)
        FDF.loc[:, 'place_of_birth_lat'] = FDF.place_of_birth.apply(transli)
        FDF.loc[:, 'relationship_lat'] = FDF.relationship.apply(transli)
        dcl_data['family'] = json.loads(
            FDF.to_json(orient="records", force_ascii=False))
    if PDF.shape != (0, 0):
        PDF.fillna('', inplace=True)
        PDF.reset_index(drop=True, inplace=True)
        PDF = fullrows(PDF, 'owner')
        for i in PDF[PDF['location_area'].str.endswith(
                ',')].index.values[::-1] + 1:
            wow(PDF, i)
        PDF.loc[:, 'owner'] = PDF.owner.str.replace('\r', ' ')
        PDF.loc[:, 'location_area'] = PDF.location_area.str.replace('\r', ' ')
        PDF.loc[:, 'owner_lat'] = PDF.owner.apply(transli)
        PDF.loc[:, 'type_lat'] = PDF.type.apply(transli)
        PDF.loc[:, 'purchase_type_lat'] = PDF.purchase_type.apply(transli)
        PDF.loc[:, 'location_area_lat'] = PDF.location_area.apply(transli)
        PDF.loc[:, 'owners_lat'] = PDF.owners.apply(transli)
        dcl_data['purchase'] = json.loads(
            PDF.to_json(orient="records", force_ascii=False))
    return dcl_data
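# make_fdf and make_pdf are called in parser but not defined in this snippet.
# Given the later `FDF.shape != (0, 0)` checks, they most likely return truly
# empty frames; a minimal sketch under that assumption (hypothetical helpers):
import pandas as pd

def make_fdf():
    """Empty frame to accumulate family-member rows."""
    return pd.DataFrame()

def make_pdf():
    """Empty frame to accumulate property rows."""
    return pd.DataFrame()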
def out_tables(file, text_between, preprocessed_text, header, corpus, pages,
               table_start_list, table_end_list):
    '''Takes the table names list as input and returns the tables related to
    the table names list.

    Parameters:
        file(str): filename or the DOWID of the document
        text_between(str): the text between the start and end patterns in the
            text of the pdf
        preprocessed_text(list): the pdf text data which is already preprocessed
        header(str): string pattern of the header, for example 'Precision'
        corpus(object): PyPDF2 reader object that can be used to read the pdf later
        pages(number): total number of pages in the document
        table_start_list(list): list of tables referenced in the heading
            (like 'Precision') text
        table_end_list(list): list of table names which are expected as the end
            patterns for table_start_list

    Returns:
        tabledata(list): list of tables found on the pages of the pdf that
            contain the required heading (like 'Table I') data.
    '''
    tabledata = []
    remove_text = r"\.\.|following tab|table(|s) be|Metablen|in the table(|s)(|\.)|(T|t)able(|s) (be|of)|[a-z][a-z]table|table according"
    textdata = re.sub(remove_text, "", str(text_between))
    preprocessed_text1 = (re.sub(r'see Appendix|' + remove_text, "", temp)
                          for temp in preprocessed_text)
    # check every capitalisation variant of the table keyword against the text
    if any(word in str(textdata)
           for word in (" Table ", " table ", " tables ", " Tables ",
                        " TABLE ", " TABLES ")):
        table_start_list, table_end_list = out_tables_list(text_between)
    if table_end_list != ['', ' ']:
        table_start = re.search(r"Table(|s)\s*\-*\s*\w*", str(text_between))
        if table_start != None:
            j = 0
            find = ''
            find_list = [
                'THE IFORMATION HEREIN', 'THE INFORMATION HEREIN',
                'The information herein', 'Appendix'
            ]
            text_at_end = ''
            for word in range(0, len(find_list)):
                for page in range(0, len(preprocessed_text)):
                    if find_list[word] in preprocessed_text[page]:
                        tablepage = page
                        text_at_end = preprocessed_text[tablepage:len(preprocessed_text)]
                        find = find_list[word]
                        break
            if find != '' and table_start is not None:
                for table in range(0, len(table_start_list)):
                    table_start = table_start_list[table]
                    table_end = table_end_list[table]
                    if str(table_end) in str(text_at_end):
                        pattern = str(table_start) + "(.+?)" + str(table_end)
                    else:
                        table_end = str(text_at_end)[len(str(text_at_end)) - 8:len(str(text_at_end))]
                        pattern = str(table_start) + "(.+?)" + table_end
                    text_between = str(re.findall(pattern, str(text_at_end)))
                    for k in range(tablepage, len(preprocessed_text)):
                        if table_start in preprocessed_text[k]:
                            new_pages = str(k + 1) + "-" + str(k + 2)
                            if k + 1 == pages:
                                new_pages = str(pages)
                            tabledata = read_pdf(file,
                                                 output_format="dataframe",
                                                 pages=new_pages,
                                                 lattice=True,
                                                 multiple_tables=True,
                                                 encoding='latin-1',
                                                 position="absolute")
                            tabledata = string_comp_match(
                                text_between=str(text_at_end),
                                tabledata=tabledata)
                            if tabledata != []:
                                break
    return tabledata
import pandas as pd
import numpy as np
from datetime import timedelta, datetime
import tkinter as tk
from tkinter import filedialog
import re, shutil, csv
from tabula import wrapper  # needed for wrapper.read_pdf below

tkroot = tk.Tk()
tkroot.withdraw()
file = filedialog.askopenfilename()

df = wrapper.read_pdf(file, pages='all')
doi = re.compile(r'(\d\d/\d\d/\d\d)')
Dateoi = doi.findall(str(df['Unnamed: 2'].str.extract(r'(\d\d/\d\d/\d\d)')))
df = df.rename(
    columns={
        'AGCO International GmbH': 'data',
        'Unnamed: 1': 'data1',
        'Unnamed: 2': 'data2',
    })
# create empty df for dissection
# write individual lines to new df
invr = re.compile(r'(A0\d{6})')
# data = wrapper.read_pdf("http://gs.cufe.edu.cn/system/resource/storage/download.jsp?mark=REE2MkE4Q0M2QzAwRjRDRjI0Q0JFRTJFRDAyOTk0MzcvMDMxNDc5RDQvOUFEMQ== ") # print(data) import PyPDF2 from tabula import wrapper from Office.save_file import save_dfto_csv # import tabula # pdfpath = r"C:\Users\xfs9619\Desktop\附件1 中央财经大学2019-2020学年第一学期研究生课程表(公共课部分)\中央财经大学2019-2020学年第一学期研究生课程表(学生专用).pdf" reader = PyPDF2.PdfFileReader(open(pdfpath, mode='rb')) n = reader.getNumPages() df = [] for page in range(18, 264): try: d = wrapper.read_pdf(pdfpath, pages=page) dfs = d.set_index( d['时间']).drop(columns=['时间', 'Unnamed: 0']).stack().reset_index() dfs.columns = ['节数', '星期', '课程信息'] dfs = dfs.dropna(how='any') dfs['课程名称'] = dfs['课程信息'].apply(lambda x: x.split('\r')[0]) dfs['时间范围'] = dfs['课程信息'].apply(lambda x: x.split('\r')[1]) dfs['地点'] = dfs['课程信息'].apply(lambda x: x.split('\r')[2]) save_dfto_csv('课表', dfs) #res = dfs.drop(columns='课程信息') # df.append(d) # d.to_excel('%d.xlsx') print(str(page) + 'success') except: print(str(page) + 'fail')
#!/usr/bin/env python
# coding: utf-8

# In[7]:

import pandas as pd
get_ipython().system('pip install tabula-py')
from tabula import wrapper

df = wrapper.read_pdf('C:/Users/ramaruth/Downloads/hdfc.pdf')

# In[164]:

df

# In[165]:

data = df[[
    'Date', 'Narration', 'Withdrawal Amt.', 'Deposit Amt.', 'Closing Balance'
]]

# Rows where Narration is null don't provide any transaction details, hence we delete them.

# In[166]:

data[data.Narration.isnull()]

# In[167]:

data = data.dropna(subset=['Narration'], axis=0, how='any')
def scrap(filename):
    # pdf file object
    # you can find the pdf file with the complete code below
    pdfFileObj = open(filename, 'rb')
    # print(filepath)
    # pdf reader object
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # number of pages in pdf
    print(pdfReader.numPages)
    no_of_pages = pdfReader.numPages

    # This scrapes the PDF
    text = convert_pdf_to_txt(filename, pages=range(no_of_pages))
    arr = text.split('\n')  # we split by new line
    print(arr)
    trans_details = arr

    # We will retrieve the labels by finding the index of Beginning and Ending balance.
    st_index_labels = arr.index('Beginning Balance')
    end_index_labels = arr.index('Ending Balance')
    labels = arr[st_index_labels:end_index_labels + 1]
    print(labels)

    # find start index
    index = end_index_labels
    st = 'Ending Balance'
    while index < (len(arr) - 1):
        if arr[index] != '':
            if arr[index][0] == '$':
                st = arr[index]
                break
        index = index + 1
    print(index)
    st_index = index
    print(st_index)
    print(len(labels))
    summary = arr[st_index:st_index + len(labels)]
    summary[len(summary) - 1] = summary[len(summary) - 1][1:]
    # Wrangle strings to remove dollar sign and commas. Convert them to float to plot them.
    summary[0] = summary[0][1:]  # removes the dollar sign in front, so we can use it for plotting
    summary = [s.replace(',', '') for s in summary]  # removes commas from the amount
    print(summary)

    # This scrapes the dates from the transaction details
    dates = cleanDates(trans_details)
    print(len(dates))
    months = getMonths(dates)
    labels.insert(0, 'Month')
    summary.insert(0, months[0])
    summary = [float(i) for i in summary]  # converts the whole list into floats
    summary_dataframe = pd.DataFrame({'Labels': labels, 'Amount': summary})
    summary_dataframe.to_csv(
        'C:\\Users\\nidhi\\Desktop\\UCSC_Spring\\WebScraping\\Summary.csv',
        sep=',',
        mode='a',
        index=False)

    # Adjust the page number.
    # Test which is the start page where the transaction details are.
    # If you give a page which does not have a table, then tabula will throw an exception.
    print(summary_dataframe.info())
    print(summary_dataframe.head(10))
    startpage = 0
    for page in range(no_of_pages):
        try:
            text = convert_pdf_to_txt(filename, pages=[page])
            arr = text.split('\n')
            index = arr.index('*start*transaction detail')
            if index >= 0:
                break
        except ValueError:
            startpage = startpage + 1
    print(startpage)
    print("Nit")

    # from tabula import wrapper
    # for page in range(startpage,)
    # tables = wrapper.read_pdf("C:\\Users\\nidhi\\Desktop\\UCSC_Spring\\WebScraping\\s2.pdf", multiple_tables=True, pages=0)
    # print(len(tables[0][2]))
    # for table in tables:
    #     for tab in range(len(table[0][2])):
    #         print(tables[0][2][tab])
    # os.path.abspath("E:/Documents/myPy/")

    # Strange observations:
    # 1) Though pages='all' means iterating all the pages, and multiple_tables=True
    #    should help accumulate multiple tables, it returns only one.
    # 2) The first page where the transactions start (like s4, s6) does fine or
    #    has the amounts in column 0,1; otherwise the balance is in column 0,2.
    balance = []
    amount = []
    description = []
    # print(len(tables[0][1]))
    # print(tables[0][2][tab])
    # for tab in range(len(tables[0][1])):
    #     print(tables[0][1][tab])
    for page in range(startpage, no_of_pages):
        print(page)
        try:
            tables = wrapper.read_pdf(filename,
                                      multiple_tables=True,
                                      pages=page + 1)
            print(tables[0][2])
            if page == startpage:
                try:
                    for tab in range(len(tables[0][1])):
                        amount.append(tables[0][1][tab])
                except:
                    print("Amount Hi")
            else:
                try:
                    for tab in range(len(tables[0][2])):
                        amount.append(tables[0][2][tab])
                except:
                    print("Amount Hi")
            try:
                for tab in range(len(tables[0][0])):
                    description.append(tables[0][0][tab])
            except:
                print(" Description Hi")
            if page == startpage:
                try:
                    for tab in range(len(tables[0][2])):
                        balance.append(tables[0][2][tab])
                except:
                    print("Balance Hi")
            else:
                try:
                    for tab in range(len(tables[0][3])):
                        balance.append(tables[0][3][tab])
                except:
                    print("Balance Hi")
            # cleanedList = [x for x in balance if str(x) != 'nan']
            print(amount)
            print(balance)
        except:
            print("table error")
    print(amount)
    cleanedAmount = cleanAmountAndBalance(amount)
    cleanedAmount = [float(i) for i in cleanedAmount]
    cleanedBalance = cleanAmountAndBalance(balance)
    cleanedBalance = [float(i) for i in cleanedBalance]
    cleanedBalance = cleanedBalance[1:]
    print(cleanedBalance)
    if len(cleanedBalance) > len(cleanedAmount):
        cleanedBalance = cleanedBalance[:len(cleanedBalance) - 1]
    cleanedDescription = cleanDescription(description)
    bankStatementInfo = pd.DataFrame({
        'Dates': dates,
        'Months': months,
        'Description': cleanedDescription,
        'Amount': cleanedAmount,
        'Balance': cleanedBalance
    })
    print(bankStatementInfo.info())
    print(bankStatementInfo.head(10))
    from pathlib import Path
    my_file = Path("bankStatements.csv")
    if my_file.is_file():
        # file already exists: append without writing the header again
        bankStatementInfo.to_csv('bankStatements.csv',
                                 index=False,
                                 header=False,
                                 sep=',',
                                 mode='a')
    else:
        # file does not exist yet: write it with the header
        bankStatementInfo.to_csv('bankStatements.csv',
                                 index=False,
                                 sep=',',
                                 mode='a')
    ism = re.match(r"^\d{2}\/\d{2}", '01/22')
    if ism:
        print("Hi")
    return "Hello Nits!!"
# encoding: utf-8
from tabula import wrapper

df = wrapper.read_pdf("/Users/my/Documents/program/消费行业周报20190111.pdf")
print(df)
for indexs in df.index:
    # iterate over the rows and print the company names
    print(df.loc[indexs].values[1].strip())
print('Loading file...', end="", flush=True)
with open(file_name, 'wb') as fd:
    for chunk in r.iter_content(102400):
        fd.write(chunk)
        print('...', end="", flush=True)
print('[DONE]')

# In[5]:

# wrapper.convert_into(file_name, (file_name + '.csv'), output_format='csv', pages='all')

# In[6]:

df = wrapper.read_pdf(file_name, pages='1700-1790')

# In[7]:

df.head()

# In[8]:

# df.columns = ['NR. DOSAR', 'DATA ÎNREGISTRĂRII', 'TERMEN', 'SOLUȚIE']
df.columns = ['NR. DOSAR', 'DATA INREGISTRARII', 'TERMEN', 'SOLUTIE']

# In[9]:

# print(df.shape)
# print(type(df.iloc[0,2]))