# parse_ks.py
# -*- coding: utf-8 -*-
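"""
-------------------------------------------------
File Name: parse_ks.py
Description : Parse an enterprise income tax annual return PDF with
              pdfminer and collect the basic company information,
              the shareholder list and the loss carry-forward details
              into a dictionary.
Author : ianchen
date:
-------------------------------------------------
Change Activity:
2017/11/22:
-------------------------------------------------
"""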
import json
import re
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser, PDFDocument
# Titles of the three tables this parser understands. A page is only parsed
# when its first layout element is a text box holding one of these titles.
_LOSS_TABLE = "A106000企业所得税弥补亏损明细表\n"
_ANNUAL_RETURN = "中华人民共和国企业所得税年度纳税申报表(A类)\n"
_BASIC_INFO = "A000000企业基础信息表\n"
_TABLE_TITLES = (_LOSS_TABLE, _ANNUAL_RETURN, _BASIC_INFO)

# Text boxes that immediately precede the data boxes of the loss table.
_YEAR_LABELS = '前五年度\n前四年度\n前三年度\n前二年度\n前一年度\n本年度\n可结转以后年度弥补的亏损额合计\n'
# NOTE(review): the year sequence and the "2016" filing year below are
# hard-coded, so other filing years will not match these checks.
_YEAR_SEQUENCE = "2011\n2012\n2013\n2014\n2015\n2016\n"
_CURRENT_YEAR = "2016"

# Text boxes of the shareholder section that are headers or unrelated
# fields rather than shareholder data.
_SHAREHOLDER_SKIP = (
    "证件", "主要股东", "经济性质", "投资比例", "国籍", "302中国境内",
    "公司财务室", "备抵法", "直接核销法", "人民币",
)


def _scan_pages(doc):
    """Walk every page of *doc* and collect the raw text boxes of interest.

    Returns a dict with the split lines of each recognised box. Values stay
    empty (or ``None`` for ``"standard"``) when the box was never seen.
    Intermediate values are printed to stdout as they are found.
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    found = {
        "years": [],              # year column of the loss table
        "loss_amounts": [],       # amount column of the loss table
        "amounts": [],            # amount column of the annual return
        "inventory": [],          # box naming the inventory costing method
        "standard": None,         # accounting standard; None = never seen
        "basic": [],              # six-line basic information box
        "shareholder_boxes": [],  # raw boxes of the shareholder section
    }
    # The current table persists across pages: a page whose first element is
    # not a text box keeps the table of the previous page.
    table = ""

    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        previous = ""
        in_shareholders = False

        # `position` counts every layout element, not only text boxes,
        # because the checks below rely on absolute positions on the page.
        for position, item in enumerate(layout, start=1):
            if not isinstance(item, LTTextBoxHorizontal):
                continue
            text = item.get_text()

            if position == 1:
                if text not in _TABLE_TITLES:
                    break  # unknown table: skip the whole page
                table = text
                in_shareholders = False

            lines = text.split("\n")

            if table == _LOSS_TABLE:
                if previous == _YEAR_LABELS:
                    found["years"] = lines
                    print(lines)
                if previous in ('2\n', _YEAR_SEQUENCE) and len(lines) == 7:
                    found["loss_amounts"] = lines
                    print(lines)

            elif table == _ANNUAL_RETURN:
                after_header = previous == '金额\n' and position == 11
                looks_like_amounts = (
                    position == 10 and "%" in text and "0.00" in text
                )
                if after_header or looks_like_amounts:
                    found["amounts"] = lines
                    print(lines)

            elif table == _BASIC_INFO:
                if "备抵法" in text or "直接核销法" in text:
                    found["inventory"] = lines
                    print(lines)

                if position == 8:
                    # The value follows the field label after a single space.
                    parts = lines[0].split(" ")
                    found["standard"] = parts[1] if len(parts) > 1 else ""
                    print(found["standard"])

                if "否" in text:
                    if len(lines) == 6:
                        found["basic"] = lines
                        print(lines)
                    else:
                        # NOTE(review): as in the original flow, this skips
                        # the shareholder checks and leaves `previous`
                        # untouched for this box - confirm this is intended.
                        continue

                if "301企业主要股东" in text:
                    in_shareholders = True
                    found["shareholder_boxes"] = []

                if in_shareholders and not any(
                    word in text for word in _SHAREHOLDER_SKIP
                ):
                    found["shareholder_boxes"].append(text)

            previous = text

    return found


def _parse_certificates(text):
    """Split a certificate text box into (certificate type, number) pairs."""
    chunks = text.replace("\n", "").split("居民身份证")
    pairs = []
    for position, chunk in enumerate(chunks):
        parts = chunk.split("营业执照")
        # parts[0] is the number that followed a "居民身份证" marker. Chunk 0
        # precedes the first marker, so it can only hold business licences;
        # it used to be dropped entirely, losing any licence listed first.
        if position > 0 and (parts[0] or len(parts) == 1):
            pairs.append(("居民身份证", parts[0]))
        pairs.extend(("营业执照", number) for number in parts[1:])
    return pairs


def _parse_shareholders(boxes):
    """Build the shareholder mapping from the shareholder-section boxes.

    Returns a dict keyed by "1", "2", ... or ``None`` when no certificate
    box exists or the surrounding boxes do not provide a complete record
    for every certificate.
    """
    cert_index = None
    for position, box in enumerate(boxes):
        if "居民身份证" in box or "营业执照" in box:
            cert_index = position
            break
    if cert_index is None:
        return None

    certificates = _parse_certificates(boxes[cert_index])
    count = len(certificates)
    flat = [box.replace("\n", "") for box in boxes]

    # Layout assumed by the slicing below: names, the certificate box, two
    # boxes per shareholder (economic nature, investment ratio), then one
    # nationality box per shareholder; leftover names may trail at the end.
    start = cert_index + 1
    invest_end = start + 2 * count
    nation_end = invest_end + count
    investments = flat[start:invest_end]
    nationalities = flat[invest_end:nation_end]
    names = flat[:cert_index]
    if cert_index != count:
        names = names + flat[nation_end:]

    if (
        len(investments) < 2 * count
        or len(nationalities) < count
        or len(names) < count
    ):
        return None

    shareholders = {}
    for number, (kind, cert_no) in enumerate(certificates):
        nationality = nationalities[number]
        if "中华人民" in nationality or "香港" in nationality:
            nationality = "中国"
        shareholders[str(number + 1)] = {
            "证件种类": kind,
            "证件号码": cert_no,
            "经济性质": investments[2 * number],
            "投资比例": investments[2 * number + 1],
            "国籍": nationality,
            "股东名称": names[number],
        }
    return shareholders


def _build_result(found):
    """Turn the raw boxes collected by ``_scan_pages`` into the result dict."""
    pdf_dict = {}

    basic = found["basic"]
    inventory = found["inventory"]
    standard = found["standard"]
    # The four basic fields are all-or-nothing: if any source box is missing
    # every one of them is reported as an empty string.
    if len(basic) > 3 and len(inventory) > 1 and standard is not None:
        pdf_dict['所属行业明细'] = basic[2]
        pdf_dict['从业人数'] = basic[3]
        pdf_dict['存货计价方法'] = inventory[1]
        if "一般企业" in standard:
            standard = "一般企业会计准则"
        pdf_dict['企业会计准则为'] = standard
    else:
        print("basic information incomplete")
        pdf_dict['所属行业明细'] = ""
        pdf_dict['从业人数'] = ""
        pdf_dict['存货计价方法'] = ""
        pdf_dict['企业会计准则为'] = ""

    shareholders = _parse_shareholders(found["shareholder_boxes"])
    if shareholders is not None:
        pdf_dict['股东信息'] = shareholders

    # Entry 18 of the annual return amount column is reported as the
    # adjusted taxable income. A missing column yields "" instead of
    # aborting the whole parse.
    amounts = found["amounts"]
    adjusted_income = amounts[18] if len(amounts) > 18 else None
    pdf_dict['纳税调整后所得'] = "" if adjusted_income is None else adjusted_income

    loss_details = {}
    loss_amounts = found["loss_amounts"]
    # The last entry of the year column is the remainder after the trailing
    # newline, so it is left out.
    for index, year in enumerate(found["years"][:-1]):
        if year == _CURRENT_YEAR and adjusted_income is not None:
            loss_details[year] = adjusted_income
        elif index < len(loss_amounts):
            loss_details[year] = loss_amounts[index]
        else:
            print("ksmx")
            break
    pdf_dict["亏损明细"] = loss_details
    return pdf_dict


def parse_ndpdf(pdf_path):
    """Parse an annual income tax return PDF into a dictionary.

    The PDF is read with pdfminer. Pages are recognised by the title in
    their first text box (basic information table, annual return, loss
    table); values are picked from text boxes by content and position.
    Intermediate values and the final result are printed to stdout.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with the keys '所属行业明细', '从业人数', '存货计价方法',
        '企业会计准则为', '纳税调整后所得' (strings, "" when unavailable) and
        '亏损明细' (dict mapping year to amount). '股东信息' (dict keyed by
        "1", "2", ...) is present only when the shareholder section could
        be parsed completely.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
        OSError: If the file cannot be opened.
    """
    # pdfminer reads lazily from the file object, so every page must be
    # processed before the file is closed.
    with open(pdf_path, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        found = _scan_pages(doc)

    pdf_dict = _build_result(found)
    print(pdf_dict)
    return pdf_dict
# Run the parser on a local sample only when executed as a script, so that
# importing this module does not try to open a hard-coded file.
if __name__ == "__main__":
    # Other sample files that were used for manual runs:
    # parse_ndpdf('年度申报表详情10024417000014718405.pdf')
    # parse_ndpdf('年度申报表详情10024417000014709743.pdf')
    # parse_ndpdf("年度申报表详情10024417000013589901.pdf")
    parse_ndpdf('年度申报表详情10024417000014607039.pdf')

# Alternative extraction via tabula, kept for reference (not used):
# import tabula
#
# df=tabula.read_pdf('D:\\新建文件夹\\excercise-master\\年度申报表详情10024417000014718405.pdf')
# print(df)