Example #1
        f.write(resp.content)

from glob import glob
pdf_filenames = glob('CAWARN-*.pdf')
for pdf_fname in pdf_filenames:
    print("This is a filename of a pdf:", pdf_fname)

import csv
import pdfplumber

outfile = open('CAWARN12.csv', 'w')
outcsv = csv.writer(outfile)

pdf_fname12 = 'CAWARN-eddwarncn12.pdf'

pdf = pdfplumber.open(pdf_fname12)

for page in pdf.pages:
    table = page.extract_table()
    for row in table[1:]:  # skip the header row on each page
        outcsv.writerow(row)
outfile.close()

outfile = open('CAWARN13.csv', 'w')
outcsv = csv.writer(outfile)

pdf_fname13 = 'CAWARN-eddwarncn13.pdf'

pdf = pdfplumber.open(pdf_fname13)

for page in pdf.pages:
    table = page.extract_table()
    for row in table[1:]:  # skip the header row on each page
        outcsv.writerow(row)
outfile.close()

Example #2
import os
from decimal import Decimal

import pdfplumber

def extract_title_main(file_name):
    title = ""
    subtitle = ""
    strapline = ""
    sub_strapline = ""
    main_text = ""
    check_list = ["0","1","2","3","4","5","6","7","8","9",","]
    with pdfplumber.open(os.getcwd() + '/GICorpus2/' + file_name) as pdf:
        # total number of pages
        total_pages = pdf.pages
        for page in total_pages:
            total_char = page.chars
            charsize = len(total_char)
            for ch in range(charsize):
                # font sizes are stored as Decimal, so they need an int conversion
                temp = int(Decimal(total_char[ch].get("size")))

                if temp == 23:  # title
                    title = title + total_char[ch].get("text")

                elif temp == 16:  # section heading
                    strapline = strapline + total_char[ch].get("text")
                    if ch + 1 < charsize and int(Decimal(total_char[ch + 1].get("size"))) < 16:
                        strapline = strapline + "\t"

                elif temp == 11:  # sub-heading
                    sub_strapline = sub_strapline + total_char[ch].get("text")

                else:  # body text (includes >, <, commas, and other chars whose font size is not 10)
                    # drop italics
                    if total_char[ch].get("upright") == False:
                        continue

                    # skip the "-" inserted at line breaks
                    if total_char[ch].get("text") == "-":
                        if (int(Decimal(total_char[ch].get("x0"))) > 220) and (int(Decimal(total_char[ch].get("x1"))) > 220):
                            continue

                    main_text = main_text + total_char[ch].get("text")
                    # sentences can't be split when "." has no trailing space,
                    # so append a space after "." unless a digit or comma follows;
                    # the bounds check avoids an IndexError from the ch + 1 lookahead
                    if ch < charsize - 2:
                        if (total_char[ch].get("text") == ".") and not (total_char[ch + 1].get("text") in check_list):
                            main_text = main_text + " "

    title = title + "."

    f = open(os.getcwd() + "/GICorpus2/" + file_name[0:-4] + ".txt", "w")
    f.write(title)
    f.write("\n\n")
    f.write(main_text)
    f.close()
Example #3
def extractLinesFromPDF(filepath, agency, date):
	print(filepath)
	with pdfplumber.open(filepath) as pdf:

		# init lines list
		lines = list()

		# loop over pages
		for page_index, page in enumerate(pdf.pages):
			# crop page
			page = cropHeaderAndFooter(page, page_index)

			# convert to a list of lines with formatting
			lines += getLinesWithFormatting(page, page_index, agency, date)

		# convert font information into a set of ranked dummy vars
		lines = cleanFontNames(lines)
		lines = assignFontStyles(lines)

		# bucket left indentations into 5 ranked dummy vars
		lines = bucketLeftIndentation(lines, agency)

		return lines
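
The function depends on project helpers (cropHeaderAndFooter, getLinesWithFormatting, cleanFontNames, assignFontStyles, bucketLeftIndentation) that the snippet does not show; a hedged usage sketch with placeholder arguments, runnable only once those helpers are defined:

# Hypothetical call; the file name, agency, and date are placeholders.
lines = extractLinesFromPDF("minutes.pdf", agency="FCC", date="2016-01-01")
for line in lines[:5]:
    print(line)
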
Example #4
import pdfplumber
import os

#os.chdir(r"D:\\code\\python_project\\python_code\\get_pdf\\")

with pdfplumber.open(r"get_pdf\\延安高铁站.pdf") as pdf:
    pages = pdf.pages  # 第一页的信息
    for i in pages:
        text = i.extract_text()
        print(text)
        "text_y_tolerance": 0,
        "text_x_tolerance": 2,
    })

    table = pd.DataFrame([ [ month ] + row for row in _table ])

    table.columns = COLUMNS
    table[table.columns[2:]] = table[table.columns[2:]].applymap(parse_value)

    table.loc[(table["state"] == "llinois"), "state"] = "Illinois"
    table = table.loc[lambda df: df["state"].fillna("").str.strip() != ""]
    try: validate_data(table)
    except: raise Exception("Invalid data for " + month)

    return table

def parse_pdf(pdf):
    # Note: As of Nov. 2019 file, first page is documentation
    checks_gen = map(parse_page, pdf.pages[1:])
    checks = pd.concat(checks_gen).reset_index(drop=True)

    return checks[checks["state"] != "Totals"]

if __name__ == "__main__":
    with pdfplumber.open(sys.stdin.buffer) as pdf:
        checks = parse_pdf(pdf)

    checks.to_csv(sys.stdout, index=False, float_format="%.0f")

    sys.stderr.write("\r\n")
Example #6
 def test_issue_21(self):
     pdf = pdfplumber.open(
         os.path.join(HERE, "pdfs/150109DSP-Milw-505-90D.pdf"))
     assert len(pdf.objects)
     pdf.close()
Example #7
 def test_issue_203(self):
     path = os.path.join(HERE, "pdfs/issue-203-decimalize.pdf")
     with pdfplumber.open(path) as pdf:
         assert len(pdf.objects)
Example #8
import pdfplumber
import csv

pdf_name = 'raw-pdfs/crime-stats-2013.pdf'
pdf = pdfplumber.open(pdf_name)

page = pdf.pages[32]
table = page.extract_table()

with open('2013.csv', 'w') as outfile:
	outcsv = csv.writer(outfile)

	for row in table:
		outcsv.writerow(row)
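
A defensive variant of the same loop: extract_table() returns None when pdfplumber detects no table on a page, so a pass over every page would guard before writing (a sketch reusing the pdf object opened above):

# Sketch: walk all pages and skip those where no table is detected.
with open('2013-all-pages.csv', 'w') as outfile:
	outcsv = csv.writer(outfile)
	for page in pdf.pages:
		table = page.extract_table()
		if table is None:  # extract_table() found no table on this page
			continue
		for row in table:
			outcsv.writerow(row)
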
Example #9
'https://www.nccommerce.com/Portals/11/Documents/Reports/WARN/warn-2015.pdf',
'https://www.nccommerce.com/Portals/11/WARN/Warn2014.pdf',
'https://www.nccommerce.com/Portals/11/WARN/Warn-2013.pdf'
]

for url in urls:
	pdf_fname = 'NCWARN-' + basename(url)
	print("Downloading", url, 'into', pdf_fname)
	resp = requests.get(url)
	with open(pdf_fname, 'wb') as f:
		f.write(resp.content)

pdf_filenames = glob('NCWARN-*.pdf')
for pdf_fname in pdf_filenames:
	print("This is a filename of a pdf:", pdf_fname)
	pdf = pdfplumber.open(pdf_fname)
	print(type(pdf))  # confirm we got a pdfplumber.PDF object; a bare type() call does nothing in a script

# PDF 1
pdf_fname = 'NCWARN-Warn.pdf'

outfile = open('NCWARN-Warn.csv', 'w')
outcsv = csv.writer(outfile)

pdf = pdfplumber.open(pdf_fname)
for page in pdf.pages:
    table = page.extract_table()
    for row in table[1:]:  # note how I'm still skipping the header
        outcsv.writerow(row)
outfile.close()
Example #10
import csv

import pdfplumber
import requests

url_a = "https://jfs.ohio.gov/warn/"
url_b = ".stm"

NAMES = ["WARN_2015",
"WARN2014",
"WARN_2013",
"WARN_2012"]

for name in NAMES:
	fname_pdf = (name + ".pdf")
	url = (url_a + name + url_b)
	print("Downloading", url, "into", fname_pdf)
	resp = requests.get(url)
	with open(fname_pdf, "wb") as f:
		f.write(resp.content)
		f.close()


	outfile = open(name + ".csv", "w")
	outcsv = csv.writer(outfile)

	pdf = pdfplumber.open(fname_pdf)
	for page in pdf.pages:
		table = page.extract_table()
		for row in table[1:]:
			outcsv.writerow(row)
	outfile.close()

Example #11
 def test_text_colors(self):
     path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
     with pdfplumber.open(path) as pdf:
         char = pdf.pages[0].chars[3358]
         assert char["non_stroking_color"] == [1, 0, 0]
Example #12
 def test_colors(self):
     path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
     with pdfplumber.open(path) as pdf:
         rect = pdf.pages[0].rects[0]
         assert rect["non_stroking_color"] == [0.8, 1, 1]
Example #13
 def test_password(self):
     path = os.path.join(HERE, "pdfs/password-example.pdf")
     with pdfplumber.open(path, password="******") as pdf:
         assert len(pdf.chars) > 0
path = "./data/full/"
texts_path = "./data/texts/"

pathlib.Path(texts_path).mkdir(parents=True, exist_ok=True) 
files = glob.glob(path + "*")

for f in files:
    f_name = f.split("\\")[-1]
    txt_path = texts_path + f_name + ".txt"
    
    file_path = pathlib.Path(txt_path)
    
    if file_path.exists():
        continue

    pdf = pdfplumber.open(f)
    text = u''
    for page in pdf.pages:
        try: # since pdfplumber tries to convert "P14" into decimal for some reason
            et = page.extract_text()
        except:
            pass
        
        if et is not None: # since pdfplumber returns None when an empty page occurs
            text += et
        else:
            print("Extracted text is None in file " + f)

    file = open(txt_path, "wb")
    file.write(text.encode("utf-8", "ignore"))
    file.close()
Example #15
    filename = 'warn-{}.pdf'.format(i)
    with open(filename, 'wb') as save_file:
        save_file.write(resp.content)
    print('Saved to', filename)
    filenames.append(filename)

################################################################################
# CSV building + counting
total = 0

with open('warn-2012-2014.csv', 'w') as outfile:
    outcsv = csv.writer(outfile)
    outcsv.writerow(['Company Name', 'Location', 'Employees\nAffected',
                     'Layoff\nDate']) # Manually write header
    for filename in filenames[3:]:
        pdf = pdfplumber.open(filename)
        for i, page in enumerate(pdf.pages):
            print('Extracting page', i + 1, 'from', filename)
            table = page.extract_table()
            for j, row in enumerate(table):
                if i == 0 and j == 0:
                    continue  # Skip the header row on the first page of each doc
                try:
                    total += int(row[2])
                except (TypeError, ValueError):
                    print('Couldn\'t get num employees from', row)
                outcsv.writerow(row)
print(total, 'employees affected from 2012-2014 dataset')

# Don't count totals from here, because precalculated in pdf
# These pdfs don't seem to have their tables extracted as well...
with open('warn-2014-2016.csv', 'w') as outfile:
Example #16
"""
Run pdfplumber on sample PDFs.

Install with `pip install pdfplumber`.
"""
import pdfplumber

with open("pdfplumber.out", "w") as f:
    with pdfplumber.open("pdfs/1412.6980.pdf") as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:  # extract_text() returns None for pages without text
                f.write(text)
Example #17
# =============================================================================
# 10.2 PDF text-extraction basics, by 王宇韬
# =============================================================================

# 1. Extract the text of the first page
import pdfplumber
pdf = pdfplumber.open('公司A理财公告.PDF')  # open the PDF file
pages = pdf.pages  # the pages property holds a list with one entry per page
page = pages[0]  # get the first page
text = page.extract_text()  # extract this page's text
print(text)  # print the first page's content
pdf.close()  # close the PDF file

# 2. Extract the text of every page
import pdfplumber
pdf = pdfplumber.open('公司A理财公告.PDF')
pages = pdf.pages
text_all = []
for page in pages:  # iterate over every page
    text = page.extract_text()  # extract this page's text
    text_all.append(text)  # collect each page's content with list.append()
text_all = ''.join(text_all)  # join the list into one string
print(text_all)  # print the full text
pdf.close()
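
Since pdfplumber.open() supports the context-manager protocol (as the test snippets elsewhere in this listing show), the same extraction can skip the manual close(); a minimal sketch:

import pdfplumber

with pdfplumber.open('公司A理财公告.PDF') as pdf:  # the file is closed automatically on exit
    text_all = ''.join(page.extract_text() or '' for page in pdf.pages)
print(text_all)
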
Example #18
 def test_load_with_custom_laparams(self):
     # See https://github.com/jsvine/pdfplumber/issues/168
     path = os.path.join(HERE, "pdfs/cupertino_usd_4-6-16.pdf")
     laparams = dict(line_margin=0.2)
     with pdfplumber.open(path, laparams=laparams) as pdf:
         assert float(pdf.pages[0].chars[0]["top"]) == 66.384
Example #19
 def setUp(self):
     path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
     self.pdf = pdfplumber.open(path)
Example #20
 def Fin_Analysis_Mian(self, results):
     # unpack the parameters
     for param in results:
         bond_company = param[0]
         keyword = param[1]
         whether_cross_page = param[2]
         whether_around = param[3]
         print("Brokerage name: " + bond_company)
         pdf = pdfplumber.open('D:/年报/' + bond_company + '.pdf')
         table = []
         matched = []
         for page in pdf.pages[1:400]:
             data = page.extract_text()
             if keyword in data:
                 page_number = page.page_number
                 print('Starting to read data from page ' + str(page_number))
                 for i in range(page_number - 2, page_number + 2):
                     page_use = pdf.pages[i]
                     for table_list in page_use.extract_tables():
                         table.append(table_list)
         # branch on which indicator the keyword names
         if keyword.endswith('结算备付金'):
             # walk every collected table
             for i in range(len(table) - 1):
                 # walk every row of the current table
                 for j in range(len(table[i])):
                     if table[i][j][0] is not None and (table[i][j][0].startswith('客户备付金')
                             or table[i][j][0].startswith('公司自有备付金')
                             or table[i][j][0].startswith('公司自有')):
                         matched.append(table[i])
                     # whether_cross_page == 2 means the table crosses a page boundary
                     if whether_cross_page == 2:
                         # whether_around == 1: the continuation is on the next page
                         if whether_around == 1:
                             if table[i][j][0] is not None and table[i][j][0].startswith('公司自有'):
                                 matched.append(table[i + 1])
                         # whether_around == -1: the continuation is on the previous page
                         elif whether_around == -1:
                             if table[i][j][0] is not None and table[i][j][0].startswith('客户信用备'):
                                 matched.append(table[i - 1])
         elif keyword.endswith('融出资金'):
             for i in range(len(table) - 1):
                 for j in range(len(table[i])):
                     if table[i][j][0] is not None and (table[i][j][0].startswith('减:减值')
                             or table[i][j][0].startswith('个人')
                             or table[i][j][0].startswith('1-3个月')
                             or table[i][j][0].startswith('资金')):
                         matched.append(table[i])
                     if whether_cross_page == 2:
                         if whether_around == 1:
                             if table[i][j][0] is not None and table[i][j][0].startswith('6个月'):
                                 matched.append(table[i + 1])
                         elif whether_around == -1:
                             if table[i][j][0] is not None and table[i][j][0].startswith('6个月'):
                                 matched.append(table[i - 1])
         elif keyword.endswith('衍生金融工具'):
             for i in range(len(table) - 1):
                 for j in range(len(table[i])):
                     if table[i][j][0] is not None and (table[i][j][0].startswith('商品期货')
                             or table[i][j][0].endswith('生工具')
                             or table[i][j][0].startswith('权益\n互换')
                             or table[i][j][0].startswith('资金')):
                         matched.append(table[i])
                     if whether_cross_page == 2:
                         if whether_around == 1:
                             if table[i][j][0] is not None and table[i][j][0].startswith('6个月'):
                                 matched.append(table[i + 1])
                         elif whether_around == -1:
                             if table[i][j][0] is not None and table[i][j][0].startswith('6个月'):
                                 matched.append(table[i - 1])
         pdf.close()
         # drop duplicate tables
         new_list = []
         for t in matched:
             if t not in new_list:
                 new_list.append(t)
         # tag every row with the company and date, then print
         for table in new_list:
             for row in table:
                 row.append(bond_company)
                 row.append(date)  # note: `date` must come from the enclosing scope
                 print(row)
         return bond_company
Example #21
def parse_pdf(path, year):
    with pdfplumber.open(path) as pdf:
        df = pd.concat([ parse_page(page, year) for page in pdf.pages ])
    return df
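
parse_page is defined elsewhere in the original script; a hedged usage sketch with a stand-in parse_page, only to show the shape of the call:

import pandas as pd
import pdfplumber

def parse_page(page, year):
    # Stand-in for the real parser: one row per page with its raw text.
    return pd.DataFrame({"year": [year], "text": [page.extract_text()]})

df = parse_pdf("report.pdf", 2019)  # the path and year are placeholder values
print(df.head())
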
Example #22
import re

import pdfplumber

def get_report_startpage(pdf):
    """Find the page on which the financial statements start.

    Arguments:
        pdf {str} -- path to the PDF file
    Returns:
        start_page {int} -- first page of the financial statements (0 if none found)
    """
    getp = pdfplumber.open(pdf)
    total = len(getp.pages)
    # tracks how far into the first 10 pages we are
    count = 0
    # holds the start page of the statements
    start_page = 0
    # flag: does this look like an annual/quarterly report?
    flag = False
    if total > 30:
        print('Total pages:', total)
        # iterate over every page of the pdf
        for page in getp.pages:
            count += 1
            teststr = page.extract_text()
            if teststr is None:
                return 0

            # does page 1 mention an annual/quarterly report? If not,
            # there is no need to hunt for a start page
            rs = re.search('(年\s*|季\s*)度?\s*报\s*告?', teststr)
            #print(teststr)
            if rs is not None and count == 1:
                # report-related text found on page 1; look for the table of
                # contents starting on the next page
                flag = True
                continue
            elif rs is None and count == 1:
                # nothing on page 1; check page 2 (some reports carry a seal
                # on page 1 that breaks text extraction)
                print('No annual/quarterly-report text on page 1; checking page 2')
                continue
            elif rs is not None and count == 2:
                # report-related text found on page 2; look for the table of
                # contents from page 3 onward
                flag = True
                continue
            elif rs is None and count == 2:
                # neither page 1 nor page 2 mentions a report, so assume this
                # is not an annual/quarterly report
                if not flag:
                    print('Financial statements of the current file start at page', start_page)
                    return start_page
            # if pages 1-2 mention a report, search the first 10 pages for the table of contents
            if flag:
                # only the first 10 pages are examined
                if count < 11:
                    # look for the table-of-contents page
                    if re.search('目\s*录', teststr, flags=0):

                        # does this TOC page contain a financial-report entry with a page number?
                        ret = re.search('财务报告\s*(.)*\d',
                                        teststr)  # NOTE: this pattern may be fragile (lhj)
                        if ret is not None:
                            ret = ret.group()
                            # strip whitespace
                            tstr = [
                                y.strip() for y in re.split(r'[…¨ .]', ret)
                                if len(y) != 0
                            ]
                            # first element is the TOC entry name, second is the page number
                            start_page = int(tstr[1])
                            print('Financial statements of the current file start at page', start_page)
                            return start_page
                        else:
                            # this TOC page has no financial-report entry; process the next page
                            count += 1
                            continue
                    else:
                        # no TOC marker on this page; keep looking
                        print('Page', count, 'has no TOC marker; trying the next page')
                        continue
                else:
                    print('No table of contents found within the first 10 pages')
                    # give up after 10 pages
                    break

    else:
        # files of 30 pages or fewer are not processed
        print('Financial statements of the current file start at page', start_page)
        return start_page

    print('Financial statements of the current file start at page', start_page)
    return start_page
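
A hedged usage sketch; the file name is a placeholder, and by the function's own convention a return value of 0 means no start page was found:

start = get_report_startpage('annual-report.pdf')  # placeholder path
if start:
    print('Financial statements start on page', start)
else:
    print('No financial-statement start page detected')
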
Example #23
 def test_issue_140(self):
     path = os.path.join(HERE, "pdfs/issue-140-example.pdf")
     with pdfplumber.open(path) as pdf:
         page = pdf.pages[0]
         cropped_page = page.crop((0, 0, page.width, 122))
         assert len(cropped_page.extract_table()) == 5
Example #24
def main():
    dbs_source_dir = Path("/Users/jeromeko/Desktop/2020_Bank_Statements/DBS")
    uob_source_dir = Path("/Users/jeromeko/Desktop/2020_Bank_Statements/UOB")
    dest_csv = Path("/Users/jeromeko/Desktop/2020_Bank_Statements")

    dbs_all_txns = []
    uob_all_txns = []

    for folder, subfolder, pdf_files in os.walk(dbs_source_dir):
        for pdf_file in pdf_files:

            with pdfplumber.open(dbs_source_dir / pdf_file) as pdf:
                for i in range(2):  # txns only extend up to 2nd page
                    page_text = pdf.pages[i].extract_text()
                    sub_total_bool, sub_total_content = contains_sub_total(
                        pdf.pages[0].extract_text())

                    if i == 0:
                        txns_raw = txn_trimming(
                            page_text, "NEW TRANSACTIONS JEROME KO JIA JIN")
                        dbs_all_txns.append(
                            process_txn_amt(filter_legitimate_txns(txns_raw)))

                    elif i == 1 and not sub_total_bool:  # if txns extend to 2nd page
                        txns_raw = txn_trimming(page_text, "2 of 3")
                        dbs_all_txns.append(
                            process_txn_amt(filter_legitimate_txns(txns_raw)))

    for folder, subfolder, pdf_files in os.walk(uob_source_dir):
        for pdf_file in pdf_files:

            with pdfplumber.open(uob_source_dir / pdf_file) as pdf:
                for i in range(2):  # txns only extend up to 2nd page
                    page_text = pdf.pages[i].extract_text()
                    sub_total_bool, sub_total_content = contains_sub_total(
                        pdf.pages[0].extract_text())

                    if i == 0:
                        txns_raw = txn_trimming(page_text, "PREVIOUS BALANCE")
                        uob_all_txns.append(
                            process_txn_amt(filter_legitimate_txns(txns_raw)))

                    elif i == 1 and not sub_total_bool:  # if txns extend to 2nd page
                        txns_raw = txn_trimming(page_text, "Date Date SGD")
                        uob_all_txns.append(
                            process_txn_amt(filter_legitimate_txns(txns_raw)))

    for monthly_txns in uob_all_txns:
        for txn in monthly_txns:
            del txn[0:2]  # remove post dates

    all_txns = dbs_all_txns.copy()
    all_txns.extend(uob_all_txns)

    # Represent txns according to dates, desc and amt
    categorized_txns = [{
        "Date": " ".join(txn[0:2]),
        "Txn Desc": " ".join(txn[2:len(txn) - 1]),
        "Amt": txn[-1]
    } for monthly_txns in all_txns for txn in monthly_txns]

    # Load into dataframe for further manipulation
    df_categorized_txns = pd.DataFrame(categorized_txns)

    # Format date column
    df_categorized_txns["Date"] = df_categorized_txns["Date"] + " 2020"

    # Categorizing txns
    df_categorized_txns["Category"] = df_categorized_txns.apply(
        categorize_txns, axis=1)

    # Write into csv
    # df_categorized_txns.to_csv(dest_csv / "2020 transactions.csv")
    df_categorized_txns.to_csv(dest_csv / "2020 transactions test.csv")
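
The scrape ends at the close of main(); a conventional entry-point guard is the usual way such a script would be invoked (an assumption, since the original file's tail isn't shown):

if __name__ == "__main__":
    main()  # assumed entry point; not shown in the original snippet
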
Example #25
 def test_issue_14(self):
     pdf = pdfplumber.open(
         os.path.join(HERE, "pdfs/cupertino_usd_4-6-16.pdf"))
     assert len(pdf.objects)
     pdf.close()
Example #26
def pdf2txt(txt_path, pdf_path, img_path):
    print("\n[PDF-to-TXT conversion started] converting slide images to text")

    # create the output directory if it does not yet exist
    try:
        if not os.path.exists(txt_path):
            os.makedirs(txt_path)
    except OSError:
        print('Error: Creating directory. ' + txt_path)  # directory-creation error

    textt = ""
    textt1 = ""
    txt_res = ""
    table_final_text = []
    text_com = ""
    table_list = []
    a = 1  # current table row
    b = 1  # current table column

    Pdf = pdfplumber.open(pdf_path)

    for page_idx, page in enumerate(Pdf.pages):
        # text file that will hold this slide's converted content
        txtFile = open(txt_path + set_Filenum_of_Name(page_idx + 1) + ".txt",
                       "w", -1, "utf-8")

        # "slide N narration start" marker (kept in Korean for downstream steps)
        txtFile.write(str(page_idx + 1) + "번 슬라이드 해설 시작" + "\n" + "\n")

        # page text, with runs of newlines collapsed
        text = str(page.extract_text())
        text = re.sub('\\n+', '\n', text)
        text = text + "\n"

        for table in page.extract_tables():
            for row in table:
                for column in range(0, len(row)):
                    text_com = text_com + row[column] + " "
                    textt = str(a) + "행"  # "row <a>"
                    textt1 = " " + str(b) + "열 " + row[column] + "\n"  # "column <b>"
                    txt_res = txt_res + textt + textt1

                    b = b + 1
                b = 1
                a = a + 1
                text_com = text_com[:-1]
                text_com = text_com + "\n"
            # wrap each table with "table start" / "table end" markers
            table_new = '표 시작\n' + txt_res + '표 끝 \n'
            table_final_text.append(table_new)
            table_list.append(text_com)
            txt_res = ""
            text_com = ""
            a = 1
            b = 1

        # swap each raw table chunk for its labeled row/column version
        for i, j in zip(table_list, table_final_text):
            text = text.replace(i, j)

        imgcaption = imgExtract(page_idx, text, pdf_path, img_path)

        if (imgcaption == "이미지 없음"):  # helper's "no image" sentinel
            print("no image on this slide")
            txtFile.write(text + "\n")
        else:
            imgcaption = "".join(imgcaption)
            txtFile.write(text + imgcaption + "\n")

        txtFile.close()

        # filter / post-process the converted text
        NLP(txt_path + set_Filenum_of_Name(page_idx + 1) + ".txt")

        # adjust where image captions sit in the text
        modifytxt(txt_path + set_Filenum_of_Name(page_idx + 1) + ".txt",
                  page_idx)

        print(">>> slide", page_idx + 1, "converted to text")

    Pdf.close()
    print("[PDF-to-TXT conversion finished] done converting slide images to text\n")
Example #27
 def test_issue_33(self):
     pdf = pdfplumber.open(
         os.path.join(HERE, "pdfs/issue-33-lorem-ipsum.pdf"))
     assert len(pdf.metadata.keys())
     pdf.close()
Example #28
import pdfplumber

ms = [
    'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December'
]

for year in range(2004, 2014):
    for month in ms:
        s = '01pdf\\' + month + str(year) + 'ChiefsDirectory'
        with open(s + '.txt', 'w', encoding="utf-8") as fw:
            print('...')
            with pdfplumber.open(s + '.pdf') as pdf:
                for page in pdf.pages:
                    text = page.extract_text()
                    if text:  # extract_text() returns None for pages without text
                        fw.write(text)
Example #29
 def __init__(self, path):
     self.pdf = pdfplumber.open(path)
     self.draw = False
     self.debug = False
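
The snippet shows only the constructor of an unnamed wrapper class; a sketch of how such a wrapper might also release the underlying file, where the class name and everything beyond the shown attributes are assumptions:

import pdfplumber

class PDFWrapper:  # hypothetical name; the original class name is not shown
    def __init__(self, path):
        self.pdf = pdfplumber.open(path)
        self.draw = False
        self.debug = False

    def close(self):
        self.pdf.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
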
Example #30
import pdfplumber
import pyttsx3 as speech
pdf = pdfplumber.open('half.pdf')
pages = pdf.pages[16:]
speaker = speech.init()
for page in pages:
    text = page.extract_text()
    speaker.say(text)
    speaker.runAndWait()
pdf.close()
Example #31
from PIL import Image 
import pytesseract 
import sys 
import os
import pdfplumber

pdf_file = "source.pdf"
txt_file = "target.txt"
tempLoc = "tempPages/"
pageName = "page"
doc = ''

pytesseract.pytesseract.tesseract_cmd ='C:/Program Files/Tesseract-OCR/tesseract.exe'

with pdfplumber.open(pdf_file) as pdf:
    with open(txt_file, "w", encoding="utf-8") as outFile:
        for page in pdf.pages:
            # The best way to get text is when the doc already has a text
            # layer encoded (this only occurs if it was not scanned).
            doc = page.extract_text()
            # If we can't get the text the easy way, do the best we
            # can with OCR extraction of the rendered page.
            if not doc:
                im = page.to_image(resolution=512)
                filename = tempLoc + pageName + ".png"
                im.save(filename, format="PNG")
                doc = str(pytesseract.image_to_string(Image.open(filename)))
            outFile.write(doc)

Example #32
 def test_pr_77(self):
     # via https://github.com/jsvine/pdfplumber/pull/77
     path = os.path.join(HERE, "pdfs/pr-77-example.pdf")
     with pdfplumber.open(path) as pdf:
         first_page = pdf.pages[0]
         first_page.objects
Example #33
 def setup_class(self):
     path = os.path.join(HERE, "pdfs/nics-background-checks-2015-11.pdf")
     self.pdf = pdfplumber.open(path)
     self.PDF_WIDTH = self.pdf.pages[0].width
Example #34
    def test_issue_13(self):
        """
        Test slightly simplified from gist here: https://github.com/jsvine/pdfplumber/issues/13
        """
        pdf = pdfplumber.open(
            os.path.join(HERE, "pdfs/issue-13-151201DSP-Fond-581-90D.pdf"))

        # Only find checkboxes this size
        RECT_WIDTH = 9.3
        RECT_HEIGHT = 9.3
        RECT_TOLERANCE = 2

        def filter_rects(rects):
            ## Just get the rects that are the right size to be checkboxes
            rects_found = []
            for rect in rects:
                if (rect['height'] > (RECT_HEIGHT - RECT_TOLERANCE)
                        and (rect['height'] < RECT_HEIGHT + RECT_TOLERANCE)
                        and (rect['width'] > (RECT_WIDTH - RECT_TOLERANCE))
                        and (rect['width'] < RECT_WIDTH + RECT_TOLERANCE)):
                    rects_found.append(rect)
            return rects_found

        def determine_if_checked(checkbox, curve_list):
            # This figures out if the bounding box of (either) line used to make
            # one half of the 'x' is the right size and overlaps with a rectangle.
            # This isn't foolproof, but works for this case.
            # It's not totally clear (to me) how common this style of checkbox
            # is, and whether this is a useful approach to them.
            # Also note there should be *two* matching LTCurves for each checkbox.
            # But here we only test there's at least one.

            for curve in curve_list:

                if (checkbox['height'] > (RECT_HEIGHT - RECT_TOLERANCE)
                        and (checkbox['height'] < RECT_HEIGHT + RECT_TOLERANCE)
                        and (checkbox['width'] > (RECT_WIDTH - RECT_TOLERANCE))
                        and (checkbox['width'] < RECT_WIDTH + RECT_TOLERANCE)):

                    xmatch = False
                    ymatch = False

                    if (max(checkbox['x0'], curve['x0']) <= min(
                            checkbox['x1'], curve['x1'])):
                        xmatch = True
                    if (max(checkbox['y0'], curve['y0']) <= min(
                            checkbox['y1'], curve['y1'])):
                        ymatch = True
                    if xmatch and ymatch:
                        return True

            return False

        p0 = pdf.pages[0]
        curves = p0.objects["curve"]
        rects = filter_rects(p0.objects["rect"])

        n_checked = sum([determine_if_checked(rect, curves) for rect in rects])

        assert (n_checked == 5)
        pdf.close()
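
The overlap test inside determine_if_checked is the standard 1-D interval-intersection check applied once per axis; a tiny standalone illustration of the same logic:

def intervals_overlap(a0, a1, b0, b1):
    # Two intervals [a0, a1] and [b0, b1] overlap iff the larger of the
    # starts does not exceed the smaller of the ends.
    return max(a0, b0) <= min(a1, b1)

print(intervals_overlap(0, 5, 4, 9))   # True: they share [4, 5]
print(intervals_overlap(0, 5, 6, 9))   # False: disjoint
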
Example #35
import json

import pdfplumber

PATH_TO_FILE = "/Users/healthi/Downloads/HARSHITHA.PDF"
END_OF_PAGE = "---End Of Report---"


def pretty(d, indent=0):
    for key, value in d.items():
        print('\t' * indent + str(key))
        if isinstance(value, dict):
            pretty(value, indent + 1)
        else:
            print('\t' * (indent + 1) + str(value))


pdf = pdfplumber.open(PATH_TO_FILE)
#p0 = pdf.pages[4] #works
page_to_parse = [8, 9]
final_result = {}
header_name = [
    "CREATININE, SERUM", "LIPID PROFILE", "LIVER FUNCTION TESTS (LFT)",
    "UREA - SERUM / PLASMA"
]
test_name = [
    "CREATININE-SERUM/PLASMA", "CHOLESTEROL", "HDL", "TRIGLYCERIDES", "LDL",
    "VLDL", "TOTALCHOLESTEROLHDLCHOLESTEROLRATIO", "BILIRUBINTOTAL", "ALBUMIN",
    "A/GRatio", "AST(SGOT)", "ALT(SGPT)", "ALKALINEPHOSPHATASE",
    "GAMMAGLUTAMYLTRANSPEPTIDASE", "BILIRUBINCONJUGATED(DIRECT)", "UREA,SERUM",
    "URICACID-SERUM"
]
header = ""
Example #36
 def test_issue_53(self):
     pdf = pdfplumber.open(os.path.join(HERE, "pdfs/issue-53-example.pdf"))
     assert len(pdf.objects)
     pdf.close()
Example #37
        if len(filtered.chars) == 0:
            continue
        tops = [c["top"] for c in filtered.chars]
        cropped = filtered.crop((0, min(tops) - 2, filtered.width, max(tops) + 6))
        rows = cropped.extract_table(x_tolerance=1, y_tolerance=1)
        table += rows

    data = pd.DataFrame(table)
    if len(data.columns) == 6:
        data.columns = COLUMNS
    else:
        data.columns = ["sivigila_code"] + COLUMNS
    data = data.drop_duplicates().reset_index(drop=True)
    data[INT_COLS] = data[INT_COLS].astype(int)
    data["department"] = data["department"].str.strip().str.upper().apply(lambda x: DEPT_FIXES.get(x, x))
    data["sivigila_code"] = data["sivigila_code"].str.strip()
    data["municipality"] = data["municipality"].str.strip().str.upper().apply(lambda x: MUNI_FIXES.get(x, x))

    sums = data[INT_COLS].sum(axis=1)
    equalities = (sums == (data["zika_total"] * 2)).unique().tolist()
    assert equalities == [True]
    return data


if __name__ == "__main__":
    import sys

    with pdfplumber.open(sys.argv[1]) as pdf:
        data = parse(pdf)
        data.to_csv(sys.stdout, index=False, encoding="utf-8")
Example #38
 def test_issue_67(self):
     pdf = pdfplumber.open(os.path.join(HERE, "pdfs/issue-67-example.pdf"))
     assert len(pdf.metadata.keys())
     pdf.close()
Example #39
 def test_issue_90(self):
     path = os.path.join(HERE, "pdfs/issue-90-example.pdf")
     with pdfplumber.open(path) as pdf:
         page = pdf.pages[0]
         page.extract_words()
Example #40
 def test_pr_136(self):
     path = os.path.join(HERE, "pdfs/pr-136-example.pdf")
     with pdfplumber.open(path) as pdf:
         page = pdf.pages[0]
         page.extract_words()
Example #41
#!/usr/bin/env python
# Note: Some Python best-practices have been sacrificed below for simplicity's sake.
import pdfplumber
import sys, os

COLUMNS = [ "state", "permit", "handgun", "long_gun", "other", "multiple", "admin", "prepawn_handgun", "prepawn_long_gun", "prepawn_other", "redemption_handgun", "redemption_long_gun", "redemption_other", "returned_handgun", "returned_long_gun", "returned_other", "rentals_handgun", "rentals_long_gun", "private_sale_handgun", "private_sale_long_gun", "private_sale_other", "return_to_seller_handgun", "return_to_seller_long_gun", "return_to_seller_other", "totals" ]

pdf_path = os.path.join(sys.argv[1])

pdf = pdfplumber.open(pdf_path)

first_page = pdf.pages[0]

cropped = first_page.crop((0, 80, first_page.width, 485))

table = cropped.extract_table(
    v="lines",
    h="gutters",
    x_tolerance=5,
    y_tolerance=5
)

print("\t".join(COLUMNS))
for row in table:
    cols = [ (row[i] or "") for i in range(len(COLUMNS)) ]
    print("\t".join(cols).replace(",", ""))
Example #42
    def test_issue_13(self):
        """
        Test slightly simplified from gist here:
        https://github.com/jsvine/pdfplumber/issues/13
        """
        pdf = pdfplumber.open(
            os.path.join(HERE, "pdfs/issue-13-151201DSP-Fond-581-90D.pdf"))

        # Only find checkboxes this size
        RECT_WIDTH = 9.3
        RECT_HEIGHT = 9.3
        RECT_TOLERANCE = 2

        def filter_rects(rects):
            # Just get the rects that are the right size to be checkboxes
            rects_found = []
            for rect in rects:
                if (rect["height"] > (RECT_HEIGHT - RECT_TOLERANCE)
                        and (rect["height"] < RECT_HEIGHT + RECT_TOLERANCE)
                        and (rect["width"] < RECT_WIDTH + RECT_TOLERANCE)
                        and (rect["width"] < RECT_WIDTH + RECT_TOLERANCE)):
                    rects_found.append(rect)
            return rects_found

        def determine_if_checked(checkbox, checklines):
            """
            This figures out if the bounding box of (either) line used to make
            one half of the 'x' is the right size and overlaps with a rectangle.
            This isn't foolproof, but works for this case.
            It's not totally clear (to me) how common this style of checkbox
            is, and whether this is a useful approach to them.
            Also note there should be *two* matching LTCurves for each checkbox.
            But here we only test there's at least one.
            """

            for cl in checklines:

                if (checkbox["height"] > (RECT_HEIGHT - RECT_TOLERANCE)
                        and (checkbox["height"] < RECT_HEIGHT + RECT_TOLERANCE)
                        and (checkbox["width"] < RECT_WIDTH + RECT_TOLERANCE)
                        and (checkbox["width"] < RECT_WIDTH + RECT_TOLERANCE)):

                    xmatch = False
                    ymatch = False

                    if max(checkbox["x0"], cl["x0"]) <= min(
                            checkbox["x1"], cl["x1"]):
                        xmatch = True
                    if max(checkbox["y0"], cl["y0"]) <= min(
                            checkbox["y1"], cl["y1"]):
                        ymatch = True
                    if xmatch and ymatch:
                        return True

            return False

        p0 = pdf.pages[0]
        checklines = [
            line for line in p0.lines
            if round(line["height"], 2) == round(line["width"], 2)
        ]  # These are diagonals
        rects = filter_rects(p0.objects["rect"])

        n_checked = sum(
            [determine_if_checked(rect, checklines) for rect in rects])

        assert n_checked == 5
        pdf.close()
Example #43
        elif index > 2:
            for row in data:
                if row[4] is not None:
                    writer.writerow([row[2],row[4],row[6],row[8],row[10],row[12],row[14]])

        else:
            for row in data:
                writer.writerow(row)

jobs_lost = 0


#Loop through each PDF file
for index, doc in enumerate(all_pdfs):
    master_table = []
    pdf = pdfplumber.open(doc)
    fname = doc[:len(doc) - 3] + 'csv'

    # Check if this is the first of three files whose
    # format is the same. Otherwise, use other format
    for i in range(len(pdf.pages)):
        page = pdf.pages[i]
        table = page.extract_table()

        for row in table:
            if index <= 2:
                master_table.append(row)

            elif index != 4:
                if row[4] is not None:
                    master_table.append(row)
Example #44
url = 'http://www2.illinoisworknet.com/DownloadPrint/December%202015%20Monthly%20WARN%20Report.pdf'


pdf_fname = 'ILWARN-' + basename(url)
print("Downloading", url, 'into', pdf_fname)
resp = requests.get(url)
with open(pdf_fname, 'wb') as f:
    f.write(resp.content)

from glob import glob
pdf_filenames = glob('ILWARN-*.pdf')
for pdf_fname in pdf_filenames:
    print("This is a filename of a pdf:", pdf_fname)

import csv
import pdfplumber

outfile = open('ILWARN.csv', 'w')
outcsv = csv.writer(outfile)

pdf_fnameDEC = 'ILWARN-December%202015%20Monthly%20WARN%20Report.pdf'

pdf = pdfplumber.open(pdf_fnameDEC)

for page in pdf.pages:
    table = page.extract_table()
    for row in table[1:]:  # skip the header row on each page
        outcsv.writerow(row)
outfile.close()