def storage():
    """Run the document recognisers over every directory below ``data``.

    Walks the ``data`` folder that sits next to this module.  For each
    directory a fresh id code is obtained from ``randomidcode()`` and the
    recognisers are called with that directory and id code.

    NOTE(review): this layout was reconstructed from whitespace-mangled
    source.  The file-name conditions that used to pick one recogniser
    (GMP certificate, business licence, re-registration approval,
    production licence, instructions, import drug registration) are
    commented out in the original, which leaves every call active; confirm
    that this is the intended nesting.
    """
    # The original spelled this '\data', which only works because '\d' is an
    # unrecognised escape sequence; the backslash is now explicit.
    datapath = os.path.dirname(__file__) + '\\data'
    imgpath = r'D:\\IMG'
    gmp = GMP(imgpath)
    regisration = Regisration(imgpath)
    # Named so that it no longer shadows the interactive builtin ``license``.
    business_license = License(imgpath)
    certificate = ProductionCertificate(datapath, imgpath)
    pga = Improtdrug(imgpath)
    for file in os.walk(datapath):
        id_code = randomidcode()
        for file_name in file[2]:
            gmp.gmp(file[0], id_code)
            business_license.license(file[0], id_code)
            regisration.regisration(file[0], id_code)
            certificate.recognize(file[0], id_code)
            introduction.run_introduction(file[0], id_code)
            try:
                pga.start(file[0], id_code, 'shuai', '')
            except Exception as e:
                logmgr = LogMgr()
                logmgr.error(file[0] + ":" + str(e))
                # A failure moves on to the next file name of the directory.
                continue
            # One successful pass per directory is enough.
            break
def __init__(self, typeid, app_id = APP_ID, api_key = API_KEY, secret_key = SECRET_KEY):
    """Create the OCR client and make sure the local ``data`` folder exists.

    @typeid      ---- stored unchanged on ``self.typeid``
    @app_id      ---- first argument handed to ``AipOcr``
    @api_key     ---- second argument handed to ``AipOcr``
    @secret_key  ---- third argument handed to ``AipOcr``
    """
    self.client = AipOcr(app_id, api_key, secret_key)
    self.typeid = typeid
    self.codepath = os.path.dirname(__file__)
    # The original wrote '\data', relying on '\d' being an unrecognised
    # escape sequence; the backslash is now explicit, the value is the same.
    self.datapath = self.codepath + '\\data'
    os.makedirs(self.datapath, exist_ok=True)
    self.log = LogMgr()
def json2word(wordlist, savepath, savename):
    """Write every entry of *wordlist* on its own line to a ``.doc`` file.

    The file is ``<savepath>/<savename>.doc``, written as UTF-8 text.  The
    target directory is created when it is missing.  Any exception raised
    while writing is printed and reported through ``LogMgr`` instead of
    being propagated.

    @wordlist  ---- iterable of strings, one output line each
    @savepath  ---- directory of the output file ('' means the current one)
    @savename  ---- file name without the '.doc' extension

    Example:
        json2word(["【有效期】24个月"], './word', 'test1')
    """
    emb_filename = os.path.join(savepath, savename + '.doc')
    target_dir = os.path.dirname(emb_filename)
    # An empty directory part means "current directory"; os.makedirs('')
    # would raise, so only create a directory when there is one to create.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    try:
        # The with-block closes the file; no explicit close() is needed.
        with open(emb_filename, "w", encoding='utf-8') as f:
            f.writelines(line + "\n" for line in wordlist)
    except Exception as e:
        print(e)
        log_mgr = LogMgr()
        log_mgr.error('[mylog]This is error log')
class JobTable(object):
    """Job table: holds one job record and writes it through ``cxOracle``."""

    # Created once, when the class body runs, and shared by all instances.
    db = cxOracle()
    logmgr = LogMgr()

    def __init__(self):
        self.dbtable = 'OCRWORKFILE'
        self.dbflag = 2
        self.jobdict = {}

    def job_add(self, jobtmp):
        """Make *jobtmp* the current job record (the object is not copied)."""
        self.jobdict = jobtmp

    def job_del(self):
        """Empty the current job record in place when it holds anything."""
        if not self.jobdict:
            return
        self.jobdict.clear()

    def update_item(self, find_key, find_value, update_key, update_value):
        """Forward an update on table 'OCRWORKFILE' to ``self.db.update``.

        @find_key      ---- presumably the column used to find the row
        @find_value    ---- presumably the value that column must hold
        @update_key    ---- presumably the column to change
        @update_value  ---- presumably the new value
        (The original parameter notes were unreadable; confirm these
        meanings against ``cxOracle.update``.)
        """
        self.db.update('OCRWORKFILE', find_key, find_value, update_key,
                       update_value)

    def job_todb(self):
        """Build the save statement for the current record and insert it.

        Any exception is reported through ``self.logmgr`` and not re-raised.
        """
        try:
            statement = self.db.getsavesql(self.dbtable, self.jobdict,
                                           self.dbflag)
            jobsql, jobparam = statement
            self.db.insert(jobsql, jobparam)
        except Exception as err:
            self.logmgr.error(str(err))
class License(Tools):
    """Recognise business licences (营业执照) from OCR results.

    NOTE(review): the nesting below was reconstructed from whitespace-mangled
    source; confirm it against the original layout.
    """

    # (canonical key, pattern) pairs, tried in this order.  Compiled once
    # here instead of on every _judge_keywords call.
    _KEYWORD_RULES = (
        ('统一社会信用代码', re.compile(r"统*一社*会信用代码|统一*社会*信用*代码")),
        ('法定代表人', re.compile(r"法定*代表*人|法*定代表人*")),
        ('注册资本', re.compile(r'注册*资本*|注*册资*本')),
        ('成立日期', re.compile(r'成立*日期*|成*立日*期')),
        ('营业期限', re.compile(r'营业*期限*|营*业期*限')),
        ('经营范围', re.compile(r"经营*范围*|经*营范*围")),
        ('登记机关', re.compile(r"登记*机关*|登*记机*关")),
    )

    # Two-character labels that OCR split over two lines:
    # (line holding the first character, first character of the next line,
    #  key under which the rest of the next line is stored).
    _SPLIT_LABELS = (
        ('名', '称', 'ENT_NAME'),
        ('类', '型', 'ENT_TYPE'),
        ('住', '所', '住所'),
    )

    def __init__(self, imgpath):
        Tools.__init__(self)
        self.imgpath = imgpath
        self.logmgr = LogMgr()

    def _recognize(self, datas, nums):
        """Main logic: turn OCR lines into a ``{key: text}`` dict.

        Each of the first *nums* entries of *datas* is passed to
        ``_judge_keywords``.  A line that carries a keyword is stored under
        that key.  A line without a keyword is either irrelevant or the
        continuation of the previous keyword's value; to decide, the lines
        above it are scanned backwards for the keyword text that was matched
        last, and on a hit the line is appended to that key's value.

        @datas  ---- sequence of dicts with a 'words' entry
        @nums   ---- number of entries to look at
        Returns the collected dict.
        """
        keylist = []
        datadict = dict()
        flag = 0
        for (word, i) in zip(datas, range(0, nums)):
            text = word['words']
            list_result = self._judge_keywords(text)

            # Labels split over two lines.  The bounds check and the [:1]
            # slice keep a trailing fragment or an empty next line from
            # raising IndexError.
            split_hit = False
            if i + 1 < len(datas):
                following = datas[i + 1]['words']
                for head, tail, key in self._SPLIT_LABELS:
                    if text == head and following[:1] == tail:
                        datadict[key] = following[1:]
                        split_hit = True
                        break
            if split_hit:
                continue

            if list_result is not None:
                key, value, matched = list_result
                if key in datadict and keylist[-1][0] != key:
                    datadict[key] += value
                else:
                    datadict[key] = value
                flag = 1
                # Remember the canonical key and the keyword text as it
                # actually appeared in the line.
                keylist.append([key, matched])
                continue

            j = i
            while j > 0:
                if not keylist:
                    break
                if keylist[-1][0] == '统一社会信用代码':
                    # Lines with Chinese characters are never appended to
                    # the credit code.
                    if re.search(r'[\u4e00-\u9fa5]+', text):
                        break
                if flag:
                    if keylist[-1][1] in datas[j]['words']:
                        datadict[keylist[-1][0]] += text
                        break
                j -= 1
        return datadict

    def _judge_keywords(self, strword):
        """Look for a known keyword near the start of *strword*.

        Only a prefix of the line is inspected: 8 characters for lines of
        10+ characters, 6 for lines of 8-9, otherwise the whole line.

        Returns ``None`` when nothing matches, else a list:
            [0] ---- canonical key to store
            [1] ---- text following the keyword
            [2] ---- keyword text as it appeared in the line
        """
        if len(strword) >= 10:
            index = 8
        elif len(strword) >= 8:
            index = 6
        else:
            index = len(strword)
        head = strword[:index]
        for key, pattern in self._KEYWORD_RULES:
            if pattern.search(head):
                # The prefix decides whether the rule applies; the span and
                # text come from one search over the full line.
                found = pattern.search(strword)
                return [key, strword[found.end():], found.group()]
        return None

    def license_deploy(self, imgs, id_code):
        """Recognise the first usable image and return its data dict.

        @imgs     ---- iterable of dicts with 'imgpath' and 'imgjson'
        @id_code  ---- stored under 'ID_CODE' in the result

        Images whose 'imgjson' has an 'error_code' are logged and skipped.
        The dict returned for the first remaining image always carries
        'ID_CODE', 'REMARK', 'ADD_USER' and 'JOB_ID'; '登记机关' is removed.
        Returns ``None`` when every image was skipped.

        Changes from the original: an empty OCR result no longer raises
        UnboundLocalError (the dict then holds only the four entries above),
        the unused drug-name extraction is gone (it could raise on paths
        without '/' or without Chinese characters), and so is an
        ``if not datadict`` branch that could never run.
        """
        for file in imgs:
            if 'error_code' in file['imgjson']:
                self.logmgr.error(file['imgpath'] + " : Img Size Error!")
                continue
            datas = file['imgjson']['words_result']
            nums = file['imgjson']['words_result_num']
            datadict = {}
            if len(datas) > 0 and nums > 0:
                datadict = self._recognize(datas, nums)
            datadict['ID_CODE'] = id_code
            datadict['REMARK'] = ''
            datadict['ADD_USER'] = '******'
            datadict['JOB_ID'] = self._generatemd5(file['imgpath'])
            datadict.pop('登记机关', None)
            return datadict

    def license(self, path, id_code):
        """Process every file below *path* whose name contains '营业执照'.

        For each such file a job record is built and stored through
        ``self.job``; when the OCR JSON holds results they are recognised and
        written with ``_data_to_db('BUSINESSLICENCE', ...)``.  If that fails,
        the error is logged and 'IS_TO_DB' of the job is set to 'F'.

        @path     ---- directory tree to walk
        @id_code  ---- id code shared by one set of scanned documents
        """
        flag = 0
        temp = ''
        jobdict = {}
        for file in os.walk(path):
            page = 1
            for file_name in file[2]:
                if '营业执照' in file_name:
                    imgname = file_name.split('.')[0]
                    curpath = file[0].split('data')[1]
                    index = imgname.rfind('_')
                    folder = curpath[curpath.rfind('\\') + 1:]
                    # Drug name: Chinese characters of the folder name,
                    # else those of the file name.
                    found = (re.search(r'[\u4e00-\u9fa5]+', folder)
                             or re.search(r'[\u4e00-\u9fa5]+', file_name))
                    dragname = found.group()
                    paren = dragname.find('(')
                    if paren > 0:
                        dragname = dragname[:paren]
                    datajson = self._load_json(file[0] + '\\' + file_name)
                    pdf_name = imgname[:index - 2] + '.' + 'pdf'
                    cut_name = (imgname[:index] + '.'
                                + imgname[index:].split('_')[1])
                    jobid = self._generatemd5(file[0] + imgname)
                    jobdict.update({
                        # server
                        'SER_IP': '10.67.28.8',
                        'JOB_ID': jobid,
                        # source file
                        'SRC_FILE_NAME': pdf_name,
                        'SRC_FILE_PATH': (self.imgpath + '\\' + curpath
                                          + '\\' + pdf_name),
                        # cut image
                        'CUT_FILE_NAME': cut_name,
                        'CUT_FILE_PATH': 'G:\\IMG' + '\\' + curpath,
                        'HANDLE_TIME': time.strftime("%Y-%m-%d %X",
                                                     time.localtime()),
                        'DRUG_NAME': dragname,
                        # document type
                        'FILE_TYPE': '营业执照',
                        'ID_CODE': id_code,
                        # branch company
                        'SRC_CO': curpath.split('\\')[1],
                        'FILE_REL_PATH': '\\' + cut_name,
                        # file server
                        'SYS_URL': '10.67.28.8',
                        'PAGE_NUM': page,
                        'OCR_STATE': 'T',
                        'REMARK': '',
                        'ADD_USER': '******',
                    })
                    # An 'error_code' entry shows up when OCR failed, for
                    # instance because the image was too large.
                    if 'error_code' in datajson:
                        jobdict['IS_TO_DB'] = 'F'
                        self.job.job_add(jobdict)
                        self.job.job_todb()
                        self.job.job_del()
                        self.logmgr.error(file[0] + '\\' + file_name +
                                          ": img size error!")
                        continue
                    datas = datajson['words_result']
                    nums = datajson['words_result_num']
                    flag = 1
                    # intermediate (JSON) file
                    jobdict['MID_FILE_NAME'] = file_name
                    jobdict['MID_FILE_PATH'] = file[0]
                    jobdict['OCR_SCORE'] = int(self._getscore(datas, nums))
                    if len(datas) > 0 and nums > 0:
                        jobdict['IS_TO_DB'] = 'T'
                    else:
                        jobdict['IS_TO_DB'] = 'F'
                    jobdict['FILE_TEXT'] = self._middict(
                        datas, self.codepath + '\\middata\\' + curpath,
                        imgname)
                    # Read before the job is stored and cleared below.
                    temp = jobdict['FILE_TEXT']
                    page += 1
                    self.job.job_add(jobdict)
                    self.job.job_todb()
                    self.job.job_del()
                if flag and len(datas) > 0 and nums > 0:
                    datadict = self._recognize(datas, nums)
                    datadict['ID_CODE'] = id_code
                    datadict['REMARK'] = ''
                    datadict['ADD_USER'] = '******'
                    datadict['JOB_ID'] = self._generatemd5(temp)
                    print(datadict)
                    # The original tested "if not datadict" here; it could
                    # never be true after the four assignments above.
                    datadict.pop('登记机关', None)
                    try:
                        self._data_to_db('BUSINESSLICENCE', datadict)
                        nums = self._cleandata(datadict, datas, nums)
                    except Exception as e:
                        print('Error: ', e)
                        self.logmgr.error(file[0] + '\\' + file_name +
                                          "insert error!! : " + str(e))
                        self._update_item('OCRWORKFILE', 'JOB_ID', jobid,
                                          'IS_TO_DB', 'F')
                        nums = self._cleandata(datadict, datas, nums)
                        continue
def __init__(self, imgpath):
    """Initialise the ``Tools`` base, then store *imgpath* and a logger.

    @imgpath  ---- kept unchanged on ``self.imgpath``
    """
    # The base initialiser is called explicitly on Tools.
    Tools.__init__(self)
    self.imgpath = imgpath
    # Each instance gets its own LogMgr object.
    self.logmgr = LogMgr()
class Regisration(Tools):
    """Recognise drug re-registration approvals (药品再注册批件).

    NOTE(review): the nesting below was reconstructed from whitespace-mangled
    source; confirm it against the original layout.
    """

    # Patterns are compiled once here instead of on every call.
    _RE_COMMON_NAME = re.compile(r'药品*通?用名称?|药*品通?用名?称')
    _RE_PINYIN = re.compile(r'汉语?拼音?|汉?语拼?音')
    _RE_CONAME = re.compile(r"名称")
    _RE_COADDR = re.compile(r"生产*地址*|生*产地*址")

    # Rule = (canonical key, pattern, prefix limit, characters skipped after
    # the keyword).  A limit is a fixed length, 'index' for the computed
    # prefix length, or 'short' for self._short_index(strword).
    # Rules for lines whose prefix contains an ASCII colon:
    _COLON_RULES = (
        ('药品名称', _RE_COMMON_NAME, 8, 0),
        ('汉语拼音', _RE_PINYIN, 'index', 1),
        ('名称', _RE_CONAME, 4, 1),
        ('生产地址', _RE_COADDR, 'index', 1),
    )
    # Rules for all other lines, tried in this order:
    _PLAIN_RULES = (
        ('药品名称', _RE_COMMON_NAME, 8, 0),
        ('汉语拼音', _RE_PINYIN, 'index', 0),
        ('名称', _RE_CONAME, 4, 0),
        ('生产地址', _RE_COADDR, 'index', 0),
        ('审批结论', re.compile(r"审批*结论*|审*批结*论"), 'index', 0),
        ('再注册证批准文号',
         re.compile(r"药品*批准文*号|药*品批*准文号"), 'index', 0),
        ('药品批准文号有效期',
         re.compile(r"药*品批准文号有*效期|药品*批准文号*有效*期"), 'index', 0),
        ('注册证号', re.compile(r"注册*证号*|注*册证*号"), 'index', 0),
        ('批准文号有效期',
         re.compile(r"注册*证号有效期*|注*册证号有效*期"), 8, 0),
        ('主送', re.compile(r"主送"), 'index', 0),
        ('规格', re.compile(r'规格'), 'short', 0),
        ('剂型', re.compile(r'剂型'), 'short', 0),
        ('药品分类', re.compile(r"药品*分类*|药*品分*类"), 'index', 0),
    )

    # Fields copied from the recognition result into the output dict.
    _OUTPUT_FIELDS = ('药品名称', '剂型', '规格', '生产厂家', '日期')

    def __init__(self, imgpath):
        Tools.__init__(self)
        self.imgpath = imgpath
        self.logmgr = LogMgr()

    def _recognize(self, datas, nums):
        """Main logic: turn OCR lines into a ``{key: text}`` dict.

        Each of the first *nums* entries of *datas* is passed to
        ``_judge_keywords``.  A line that carries a keyword is stored under
        that key.  A line without a keyword is either irrelevant or the
        continuation of the previous keyword's value; to decide, the lines
        above it are scanned backwards for the keyword text that was matched
        last, and on a hit the line is appended to that key's value.

        @datas  ---- sequence of dicts with a 'words' entry
        @nums   ---- number of entries to look at
        Returns the collected dict.
        """
        keylist = []
        datadict = dict()
        flag = 0
        for (word, i) in zip(datas, range(0, nums)):
            text = word['words']
            list_result = self._judge_keywords(text)
            if list_result is not None:
                key, value, matched = list_result
                if key in datadict and keylist[-1][0] != key:
                    datadict[key] += value
                else:
                    datadict[key] = value
                flag = 1
                # Remember the canonical key and the keyword text as it
                # actually appeared in the line.
                keylist.append([key, matched])
                continue

            j = i
            while j > 0:
                if not keylist:
                    break
                if keylist[-1][0] == '批准文号':
                    # Was [a-zA-z], a range that also matches punctuation.
                    # NOTE(review): no rule in this class emits the key
                    # '批准文号', so this test looks unreachable -- confirm.
                    if re.search(r'.?[a-zA-Z][0-9]+', text):
                        break
                if keylist[-1][0] == '规格':
                    if not re.search(r'.*m*g|.*m*l', text):
                        break
                if flag:
                    if keylist[-1][1] in datas[j]['words']:
                        datadict[keylist[-1][0]] += text
                        break
                j -= 1
        return datadict

    def _judge_keywords(self, strword):
        """Look for a known keyword near the start of *strword*.

        Only a prefix is inspected (6 characters for lines of 8+ characters,
        otherwise the whole line, unless a rule sets its own limit).  When
        that prefix contains an ASCII colon after at least one character the
        short colon rule set is used, otherwise the full rule set.

        Returns ``None`` when nothing matches, else a list:
            [0] ---- canonical key to store
            [1] ---- text following the keyword
            [2] ---- keyword text as it appeared in the line
        """
        if len(strword) >= 8:
            index = 6
        else:
            index = len(strword)
        if re.match(r'.+?(?:\:)', strword[:index]):
            rules = self._COLON_RULES
        else:
            rules = self._PLAIN_RULES
        for key, pattern, limit, skip in rules:
            if limit == 'index':
                window = index
            elif limit == 'short':
                window = self._short_index(strword)
            else:
                window = limit
            if pattern.search(strword[:window]):
                # The prefix decides whether the rule applies; the span and
                # text come from one search over the full line.
                found = pattern.search(strword)
                return [key, strword[found.end() + skip:], found.group()]
        return None

    def _pick_fields(self, recognised):
        """Copy the output fields, dropping one leading colon from each."""
        picked = dict()
        for field in self._OUTPUT_FIELDS:
            if field in recognised:
                value = recognised[field]
                if re.match('[::]', value):
                    value = value[1:]
                picked[field] = value
        return picked

    def regisration_deploy(self, imgs, id_code):
        """Recognise the first usable image and return its output fields.

        @imgs     ---- iterable of dicts with 'imgpath' and 'imgjson'
        @id_code  ---- accepted for interface compatibility; not used here

        Images whose 'imgjson' has an 'error_code' are logged and skipped.
        For the first remaining image the selected fields are returned as a
        dict, or the string 'None' when none of them was found.  Returns
        ``None`` when every image was skipped.

        Changes from the original: an empty OCR result no longer raises
        UnboundLocalError (it now yields 'None'), and the unused drug-name
        extraction is gone (it could raise on paths without '/' or without
        Chinese characters).
        """
        for file in imgs:
            if 'error_code' in file['imgjson']:
                self.logmgr.error(file['imgpath'] + " : Img Size Error!")
                continue
            datas = file['imgjson']['words_result']
            nums = file['imgjson']['words_result_num']
            datadicttmp = {}
            if len(datas) > 0 and nums > 0:
                datadicttmp = self._recognize(datas, nums)
            datadict = self._pick_fields(datadicttmp)
            if not datadict:
                return 'None'
            return datadict

    def regisration(self, path, id_code):
        """Process every file below *path* whose name contains '药品再注册批件'.

        For each such file a job record is built and stored through
        ``self.job``; when the OCR JSON holds results they are recognised and
        written with ``_data_to_db('DRUGREGAPPROVAL', ...)``.  If that fails,
        the error is logged and 'IS_TO_DB' of the job is set to 'F'.

        @path     ---- directory tree to walk
        @id_code  ---- id code shared by one set of scanned documents
        """
        flag = 0
        temp = ''
        for file in os.walk(path):
            page = 1
            jobdict = {}
            for file_name in file[2]:
                if '药品再注册批件' in file_name:
                    imgname = file_name.split('.')[0]
                    curpath = file[0].split('data')[1]
                    index = imgname.rfind('_')
                    folder = curpath[curpath.rfind('\\') + 1:]
                    # Drug name: Chinese characters of the folder name,
                    # else those of the file name.
                    found = (re.search(r'[\u4e00-\u9fa5]+', folder)
                             or re.search(r'[\u4e00-\u9fa5]+', file_name))
                    dragname = found.group()
                    paren = dragname.find('(')
                    if paren > 0:
                        dragname = dragname[:paren]
                    datajson = self._load_json(file[0] + '\\' + file_name)
                    pdf_name = imgname[:index - 2] + '.' + 'pdf'
                    cut_name = (imgname[:index] + '.'
                                + imgname[index:].split('_')[1])
                    jobid = self._generatemd5(file[0] + imgname)
                    jobdict.update({
                        # server
                        'SER_IP': '10.67.28.8',
                        'JOB_ID': jobid,
                        # source file
                        'SRC_FILE_NAME': pdf_name,
                        'SRC_FILE_PATH': (self.imgpath + '\\' + curpath
                                          + '\\' + pdf_name),
                        # cut image
                        'CUT_FILE_NAME': cut_name,
                        'CUT_FILE_PATH': 'G:\\IMG' + '\\' + curpath,
                        'HANDLE_TIME': time.strftime("%Y-%m-%d %X",
                                                     time.localtime()),
                        'DRUG_NAME': dragname,
                        # document type
                        'FILE_TYPE': '药品再注册批件',
                        'ID_CODE': id_code,
                        # branch company
                        'SRC_CO': curpath.split('\\')[1],
                        'FILE_REL_PATH': '\\' + cut_name,
                        # file server
                        'SYS_URL': '10.67.28.8',
                        'PAGE_NUM': page,
                        'OCR_STATE': 'T',
                        'REMARK': '',
                        'ADD_USER': '******',
                    })
                    # An 'error_code' entry shows up when OCR failed, for
                    # instance because the image was too large.
                    if 'error_code' in datajson:
                        jobdict['IS_TO_DB'] = 'F'
                        self.job.job_add(jobdict)
                        self.job.job_todb()
                        self.job.job_del()
                        self.logmgr.error(file[0] + '\\' + file_name +
                                          ": img size error!")
                        continue
                    datas = datajson['words_result']
                    nums = datajson['words_result_num']
                    flag = 1
                    # intermediate (JSON) file
                    jobdict['MID_FILE_NAME'] = file_name
                    jobdict['MID_FILE_PATH'] = file[0]
                    jobdict['OCR_SCORE'] = int(self._getscore(datas, nums))
                    if len(datas) > 0 and nums > 0:
                        jobdict['IS_TO_DB'] = 'T'
                    else:
                        jobdict['IS_TO_DB'] = 'F'
                    jobdict['FILE_TEXT'] = self._middict(
                        datas, self.codepath + '\\middata\\' + curpath,
                        imgname)
                    # Read before the job is stored and cleared below.
                    temp = jobdict['FILE_TEXT']
                    page += 1
                    self.job.job_add(jobdict)
                    self.job.job_todb()
                    self.job.job_del()
                if flag and len(datas) > 0 and nums > 0:
                    datadicttmp = self._recognize(datas, nums)
                    datadict = self._pick_fields(datadicttmp)
                    datadict['ID_CODE'] = id_code
                    datadict['REMARK'] = ''
                    datadict['ADD_USER'] = '******'
                    datadict['JOB_ID'] = self._generatemd5(temp)
                    print(datadict)
                    # The original tested "if not datadict" here; it could
                    # never be true after the four assignments above.
                    try:
                        self._data_to_db('DRUGREGAPPROVAL', datadict)
                        nums = self._cleandata(datadict, datas, nums)
                    except Exception as e:
                        print('Error: ', e)
                        self.logmgr.error(file[0] + '\\' + file_name +
                                          "insert error!! : " + str(e))
                        self._update_item('OCRWORKFILE', 'JOB_ID', jobid,
                                          'IS_TO_DB', 'F')
                        nums = self._cleandata(datadict, datas, nums)
                        continue
import json from DatabaseToolsNew import cxOracle import re from FindKeyword import findImportWords import HowManyColumn4 as hmc #import openpyxl import xlwings as xw import time import hashlib from log import LogMgr from job import JobTable import random from json2word import json2word from tool import Tools logmgr = LogMgr() ''' 使用openpyxl太慢了,改用xlwings wb = openpyxl.load_workbook('C:\\Users\\DevinChang\\Desktop\\四家分公司影印件清单_去重匹配版.xlsx') sheets = wb.sheetnames sheet = wb.get_sheet_by_name(sheets[0]) shopid = sheet['B'] name = sheet['C'] strength = sheet['D'] mfrs = sheet['F'] '''
class ProductionCertificate(Tools): def __init__(self, jsonpath, imgpath): Tools.__init__(self) self.jsonpath = jsonpath self.imgpath = imgpath self.logmgr = LogMgr() def generatemd5(strid): md5 = hashlib.md5() md5.update(strid.encode('utf-8')) return md5.hexdigest() def subfiledata(self,direction, parameter, boundary, datas): leftdata = [] rightdata = [] for data in datas: if direction == 1 or direction == 2: if data['location'][parameter] >= boundary: # 此处有bug leftdata.append(data) else: rightdata.append(data) else: if data['location'][parameter] <= boundary: leftdata.append(data) else: rightdata.append(data) return leftdata+(rightdata) def _productionCertificate(self, datas, nums): """ 识别生产许可证 """ keylist = [] datadict = {}#这里做了一点小改动!!!!!!!!!!!!!!!!!!!1 i = 0 flag = 0 for (word, i) in zip(datas, range(0, nums)): list_result = self._judge_keywords(word['words']) if list_result != None: if list_result[0] in datadict and keylist[-1][0] != list_result[0]: datadict[list_result[0]] += list_result[1] flag = 1 else: datadict[list_result[0]] = list_result[1] flag = 1 keylist.append([list_result[0], list_result[2]]) else: flag = 1 j = i while j >= 0: if not keylist: break if ("分类码" in keylist[-1][0]): if re.match(r'[a-zA-z]+', word['words']): flag = 1 else: break elif "有效期至" in keylist[-1][0]: if re.match(r'[0-9]+年?[0-9]+月?[0-9]+日?', word['words']): flag = 1 else: break # # 字段追加问题 # if re.match(r'.?[::]', word['words'][:10]) and not re.match(r'质*量受*权人*',word['words']): # if not re.match(r'质*量受*权人*', word['words']): # flag = 0 # break if flag: if keylist[-1][1] in datas[j]['words']: datadict[keylist[-1][0]] += word['words'] break j -= 1 # datadict[list_result[0]] = list_result[1] # flag = 1 return datadict def _judge_keywords(self, strword): '''判断关键字''' # re_coname = re.compile(r"企业*名称*|企*业名*称") # re_cernum = re.compile(r"证书*编号*|证*书编*号") # re_addr = re.compile(r"地址") # re_cerscope = re.compile(r"认证*范围*|认*证范*围") # re_valid = re.compile(r"有效期至*|有效*期至") # re_liceauth = 
re.compile(r"发证*机关*|发*证机*关") # re_licedate = re.compile(r"发证*日期*|发*证日*期") re_entname = re.compile(r"企业*名称*|企*业名*称") re_regAddr = re.compile(r"注册*地址|注册地*址") re_uscc = re.compile(r"社会*信用社*代*码|社*会信用*社*代码*") re_legalReps = re.compile(r"法定*代表*人|法*定代表人*") re_entPrincipal = re.compile(r"企.负责人|.业负*责人|企.负.人") re_qcPrincipal = re.compile(r"质*量负责*人|质*量负责人*|.量负.人*") re_vld = re.compile(r"有*效期*至|有效*期至") re_supervisionDEP = re.compile(r"日常*监管*机构*|日*常监*管机*构") re_supervisor = re.compile(r"日常*监管*人员*|日*常监*管人*员") re_supervisorCT = re.compile(r"监督*举报*电话*|监*督举*报电*话") re_licNO = re.compile(r"编号|編号|号:|号:|号") re_licNO2 = re.compile(r"号") re_cateCode = re.compile(r"分*类码") re_prodAddrScope = re.compile(r"生*产地*址和生产*范*围|生*产*地址和*生*产范*围|.产.址和.产.围|生产地址.生产范.") re_issueOrg = re.compile(r"发证机.|发证.关") re_issuer = re.compile(r"签发*人") re_issueDate = re.compile(r"发证*日*期") re_kindsOfEnterprise = re.compile(r"企业*类型*") re_useLimit = re.compile(r"此*复印件*仅*限用*于*") re_qcLegal = re.compile(r"质*量受*权人*") re_NO = re.compile(r"NO|N0") re_authorizedDEPT = re.compile(r"国*家*食品*药品*监督*管*理局制*|.家*食.药.监督*.理局.") re_country = re.compile(r"中华人民共和国") re_kindsOfDocument = re.compile(r"药品生产许可证") #这里将提取关键字段的长度延长到了12个,尽可能的将由于印章等造成的干扰降低 if len(strword) >= 4: index = 6 else: index = len(strword) if (re.match(r'.+?(?:\:)', strword[:index])): if re_entname.search(strword[:index]): return ['企业名称_许可证', strword[re_entname.search(strword).span()[1]+1:], re_entname.search(strword).group()] elif re_regAddr.search(strword[:index]): return ['注册地址', strword[re_regAddr.search(strword).span()[1] + 1:], re_regAddr.search(strword).group()] elif re_uscc.search(strword[:9]): return ['社会信用社代码', strword[re_uscc.search(strword).span()[1] + 1:], re_uscc.search(strword).group()] elif re_legalReps.search(strword[:7]): return ['法定代表人', strword[re_legalReps.search(strword).span()[1] + 1:], re_legalReps.search(strword).group()] elif re_entPrincipal.search(strword[:7]): return ['企业负责人', strword[re_entPrincipal.search(strword).span()[1] + 1:], 
re_entPrincipal.search(strword).group()] elif re_qcPrincipal.search(strword[:7]): return ['质量负责人', strword[re_qcPrincipal.search(strword).span()[1] + 1:], re_qcPrincipal.search(strword).group()] elif re_vld.search(strword[:index]): return ['有效期至', strword[re_vld.search(strword).span()[1] + 1:], re_vld.search(strword).group()] elif re_supervisionDEP.search(strword[:8]): return ['日常监管机构', strword[re_supervisionDEP.search(strword).span()[1] + 1:], re_supervisionDEP.search(strword).group()] elif re_supervisionDEP.search(strword[:8]): return ['日常监管机构', strword[re_supervisionDEP.search(strword).span()[1] + 1:], re_supervisionDEP.search(strword).group()] elif re_supervisor.search(strword[:8]): return ['日常监管人员', strword[re_supervisor.search(strword).span()[1] + 1:], re_supervisor.search(strword).group()] elif re_supervisorCT.search(strword[:8]): return ['监督举报电话', strword[re_supervisorCT.search(strword).span()[1] + 1:], re_supervisorCT.search(strword).group()] elif re_licNO.search(strword[:3]): return ['许可证编号', strword[re_licNO.search(strword).span()[1] + 1:], re_licNO.search(strword).group()] elif re_licNO2.search(strword[:1]): return ['许可证编号', strword[re_licNO2.search(strword).span()[1] + 1:], re_licNO2.search(strword).group()] elif re_cateCode.search(strword[:5]): return ['分类码', strword[re_cateCode.search(strword).span()[1] + 1:], re_cateCode.search(strword).group()] elif re_prodAddrScope.search(strword[:11]): return ['生产地址和生产范围', strword[re_prodAddrScope.search(strword).span()[1] + 1:], re_prodAddrScope.search(strword).group()] elif re_issueOrg.search(strword[:index]): return ['发证机关', strword[re_issueOrg.search(strword).span()[1] + 1:], re_issueOrg.search(strword).group()] elif re_issuer.search(strword[:5]): return ['签发人', strword[re_issuer.search(strword).span()[1] + 1:], re_issuer.search(strword).group()] elif re_issueDate.search(strword[:index]): return ['发证日期', strword[re_issueDate.search(strword).span()[1] + 1:],re_issueDate.search(strword).group()] elif 
re_kindsOfEnterprise.search(strword[:index]): return ['企业类型', strword[re_kindsOfEnterprise.search(strword).span()[1] + 1:],re_kindsOfEnterprise.search(strword).group()] elif re_useLimit.search(strword[:10]): return ['此复印件仅限用于', strword[re_useLimit.search(strword).span()[1] + 1:],re_useLimit.search(strword).group()] # elif re_qcLegal.search(strword[:index]): # return ['质量受权人', strword[re_qcLegal.search(strword).span()[1] + 1:],re_qcLegal.search(strword).group()] elif re_NO.search(strword[:3]): return ['NO', strword[re_NO.search(strword).span()[1] + 1:],re_NO.search(strword).group()] elif re_authorizedDEPT.search(strword[:13]): return ['国家食品药品监督管理局制', strword[re_authorizedDEPT.search(strword).span()[1]:],re_authorizedDEPT.search(strword).group()] elif re_country.search(strword[:8]): return ['中华人民共和国', strword[re_country.search(strword).span()[1]:],re_country.search(strword).group()] elif re_kindsOfDocument.search(strword[:8]): return ['药品生产许可证', strword[re_kindsOfDocument.search(strword).span()[1]:],re_kindsOfDocument.search(strword).group()] else: return None else: if re_entname.search(strword[:index]): return ['企业名称_许可证', strword[re_entname.search(strword).span()[1]+1:], re_entname.search(strword).group()] elif re_regAddr.search(strword[:index]): return ['注册地址', strword[re_regAddr.search(strword).span()[1] + 1:], re_regAddr.search(strword).group()] elif re_uscc.search(strword[:9]): return ['社会信用社代码', strword[re_uscc.search(strword).span()[1] + 1:], re_uscc.search(strword).group()] elif re_legalReps.search(strword[:7]): return ['法定代表人', strword[re_legalReps.search(strword).span()[1] + 1:], re_legalReps.search(strword).group()] elif re_entPrincipal.search(strword[:7]): return ['企业负责人', strword[re_entPrincipal.search(strword).span()[1] + 1:], re_entPrincipal.search(strword).group()] elif re_qcPrincipal.search(strword[:7]): return ['质量负责人', strword[re_qcPrincipal.search(strword).span()[1] + 1:], re_qcPrincipal.search(strword).group()] elif 
re_vld.search(strword[:index]): return ['有效期至', strword[re_vld.search(strword).span()[1] + 1:], re_vld.search(strword).group()] elif re_supervisionDEP.search(strword[:8]): return ['日常监管机构', strword[re_supervisionDEP.search(strword).span()[1] + 1:], re_supervisionDEP.search(strword).group()] elif re_supervisionDEP.search(strword[:8]): return ['日常监管机构', strword[re_supervisionDEP.search(strword).span()[1] + 1:], re_supervisionDEP.search(strword).group()] elif re_supervisor.search(strword[:8]): return ['日常监管人员', strword[re_supervisor.search(strword).span()[1] + 1:], re_supervisor.search(strword).group()] elif re_supervisorCT.search(strword[:8]): return ['监督举报电话', strword[re_supervisorCT.search(strword).span()[1] + 1:], re_supervisorCT.search(strword).group()] elif re_licNO.search(strword[:3]): return ['许可证编号', strword[re_licNO.search(strword).span()[1] + 1:], re_licNO.search(strword).group()] elif re_cateCode.search(strword[:5]): return ['分类码', strword[re_cateCode.search(strword).span()[1] + 1:], re_cateCode.search(strword).group()] elif re_prodAddrScope.search(strword[:11]): return ['生产地址和生产范围', strword[re_prodAddrScope.search(strword).span()[1] + 1:], re_prodAddrScope.search(strword).group()] elif re_issueOrg.search(strword[:index]): return ['发证机关', strword[re_issueOrg.search(strword).span()[1] + 1:], re_issueOrg.search(strword).group()] elif re_issuer.search(strword[:5]): return ['签发人', strword[re_issuer.search(strword).span()[1] + 1:], re_issuer.search(strword).group()] elif re_issueDate.search(strword[:index]): return ['发证日期', strword[re_issueDate.search(strword).span()[1] + 1:],re_issueDate.search(strword).group()] elif re_kindsOfEnterprise.search(strword[:index]): return ['企业类型', strword[re_kindsOfEnterprise.search(strword).span()[1] + 1:],re_kindsOfEnterprise.search(strword).group()] elif re_useLimit.search(strword[:10]): return ['此复印件仅限用于', strword[re_useLimit.search(strword).span()[1] + 1:],re_useLimit.search(strword).group()] # elif 
re_qcLegal.search(strword[:index]): # return ['质量受权人', strword[re_qcLegal.search(strword).span()[1] + 1:], re_qcLegal.search(strword).group()] elif re_NO.search(strword[:3]): return ['NO', strword[re_NO.search(strword).span()[1]+1:], re_NO.search(strword).group()] elif re_authorizedDEPT.search(strword[:13]): return ['国家食品药品监督管理局制', strword[re_authorizedDEPT.search(strword).span()[1]:],re_authorizedDEPT.search(strword).group()] elif re_country.search(strword[:8]): return ['中华人民共和国', strword[re_country.search(strword).span()[1]:], re_country.search(strword).group()] elif re_kindsOfDocument.search(strword[:8]): return ['药品生产许可证', strword[re_kindsOfDocument.search(strword).span()[1]:],re_kindsOfDocument.search(strword).group()] else: return None def recognize_deploy(self, imgs, id_code): nums = 0 flag = 0 temp = '' datas = [] for file in imgs: #提取药品名称 id = file['imgpath'].split('/')[-2] file_name = file['imgpath'].split('/')[-1] if re.search(r'[\u4e00-\u9fa5]+', id): dragname = re.search(r'[\u4e00-\u9fa5]+', id).group() else: dragname = re.search(r'[\u4e00-\u9fa5]+', file_name).group() if 'error_code' in file['imgjson']: self.logmgr.error(file['imgpath'] + ' : ' + 'Size Error!') #判别是否是多栏 try: kindict = hmc.kinds(file['imgpath'], file['imgjson']) except Exception as e: self.logmgr.error(file['imgpath'] + ' : ' + 'Size Error!') continue print('Current processing: {}'.format(file['imgpath'])) #提取关键信息 datatmp = file['imgjson']['words_result'] nums += file['imgjson']['words_result_num'] if kindict['kinds'] == 2: datas += subfiledata(kindict['direction'], kindict['parameter'], kindict['boundary'][0], datatmp) elif kindict['kinds'] == 1: datas += datatmp if len(datas) > 0 and nums > 0: datadict = self._productionCertificate(datas, nums) if '企业类型' in datadict: del datadict['企业类型'] if '此复印件仅限于' in datadict: del datadict['次复印件仅限于'] if 'NO' in datadict: del datadict['NO'] if '国家食品药品监督管理局制' in datadict: del datadict['国家食品药品监督管理局制'] if '中华人民共和国' in datadict: del datadict['中华人民共和国'] 
if '药品许可证' in datadict: del datadict['药品许可证'] ######################################增加部分########################################### datadict['ID_CODE'] = id_code datadict['REMARK'] = '' datadict['ADD_USER'] = '******' datadict['JOB_ID'] = self._generatemd5(temp) ######################################增加部分########################################### if not datadict: nums = self._cleandata(datadict, datas, nums) return datadict return datadict #try: # #self._data_to_db('DRUGMFRSCERT', datadict) # nums = self._cleandata(datadict, datas, nums) #except Exception as e: # #self.logmgr.error(file[0] + '\\' + file_name + "insert error!! : " + str(e)) # #self._update_item('OCRWORKFILE','JOB_ID', jobid,'IS_TO_DB','F') # nums = self._cleandata(datadict, datas, nums) # continue def recognize(self, path, id_code): flag = 0 page = 0 temp ='' jobdict = {} for file in os.walk(path):#这里将原来imgpath换成了 jsonpath for file_name in file[2]: if '生产许可证' in file_name: jsonname = file_name.split('.')[0] curpath = file[0].split('data')[1] index = jsonname.rfind('_') id = curpath[curpath.rfind('\\') + 1:] if re.search(r'[\u4e00-\u9fa5]+', id): dragname = re.search(r'[\u4e00-\u9fa5]+', id).group() else: dragname = re.search(r'[\u4e00-\u9fa5]+', file_name).group() if dragname.find('(') > 0: dragname = dragname[:dragname.find('(')] jsonPath = file[0] + '\\' + file_name datajson = self._load_json(file[0] + '\\' + file_name) source_img_path = self.imgpath + curpath + '\\' + jsonname[:index] + '.' + jsonname[index:].split('_')[1] original_path = self.imgpath + '\\' + curpath + '\\' + jsonname[:index - 2] + '.' + 'pdf' #服务器 jobdict['SER_IP'] = '10.67.28.8' #job id jobdict['JOB_ID'] = self._generatemd5(file[0] + jsonname) jobid = jobdict['JOB_ID'] jobdict['SRC_FILE_NAME'] = jsonname[:index - 2] + '.' + 'pdf' jobdict['SRC_FILE_PATH'] = original_path #原文件 jobdict['CUT_FILE_NAME'] = jsonname[:index] + '.' 
+ jsonname[index:].split('_')[1] #原路径 jobdict['CUT_FILE_PATH'] = 'G:\\IMG' + '\\' + curpath #时间 jobdict['HANDLE_TIME'] = time.strftime("%Y-%m-%d %X", time.localtime()) #药品名 jobdict['DRUG_NAME'] = dragname #影像件类型 jobdict['FILE_TYPE'] = '药品生产许可证' #同一套影像件识别码 jobdict['ID_CODE'] = id_code #分公司 jobdict['SRC_CO'] = curpath.split('\\')[1] #源文件相对路径 jobdict['FILE_REL_PATH'] = '\\' + jsonname[:index] + '.' + jsonname[index:].split('_')[1] #文件服务器域名 jobdict['SYS_URL'] = '10.67.28.8' #页数 jobdict['PAGE_NUM'] = page #文件ocr解析识别状态 fk sysparams jobdict['OCR_STATE'] = 'T' #备注说明 jobdict['REMARK'] = '' #创建用户 jobdict['ADD_USER'] = '******' # 图片过大或者一些原因,没有识别出来就会有error_code字段 if 'error_code' in datajson: jobdict['IS_TO_DB'] = 'F' self.job.job_add(jobdict) self.job.job_todb() self.job.job_del() self.logmgr.error(file[0] + '\\' + file_name + ": img size error!") continue #source_img_path = 'img\\'+jsonname+'.jpg' #由于需要增加分栏的程序所以,需要图片的路径,但是目前这里面的路径存在一定的问题 # source_img_path = file[0] + '\\' + file_name # original_path = path_root + '\\' + curpath + '\\' + imgname[:index - 2] + '.' + 'pdf' # FIXME:换工作环境这里也得改! try: kindict = hmc.kinds(source_img_path, jsonPath) except Exception as e: self.logmgr.error(file[0] + '\\' + file_name + ':' + str(e)) continue #index = jsonname.rfind('.') # print('Current processing: {}'.format(source_img_path + '\\' + # '\\' + imgname[:index] + # '.' 
+ imgname[index:].split('.')[1], # file[0] + '\\' + file_name)) datas = datajson['words_result'] nums = datajson['words_result_num'] if kindict['kinds'] == 2: datas = self.subfiledata(kindict['direction'], kindict['parameter'], kindict['boundary'][0],datas) elif kindict['kinds'] == 1 or kindict['kinds'] == 0: datas = datas flag = 1 page += 1 #中间文件 jobdict['MID_FILE_NAME'] = file_name #中间文件路径 jobdict['MID_FILE_PATH'] = file[0] #评分 jobdict['OCR_SCORE'] = int(self._getscore(datas, nums)) #影像件内容是否入库 if len(datas) > 0 and nums > 0: jobdict['IS_TO_DB'] = 'T' else: jobdict['IS_TO_DB'] = 'F' #文件文本内容 jobdict['FILE_TEXT'] = self._middict(datas, self.codepath + '\\middata\\' + curpath, jsonname) ############### temp = jobdict['FILE_TEXT'] #jobdict['JOB_ID'] = self._generatemd5(jobdict['FILE_TEXT']) ############### page += 1 self.job.job_add(jobdict) self.job.job_todb() self.job.job_del() if flag: if len(datas) > 0 and nums > 0: datadict = self._productionCertificate(datas, nums) if '企业类型' in datadict: del datadict['企业类型'] if '此复印件仅限于' in datadict: del datadict['次复印件仅限于'] if 'NO' in datadict: del datadict['NO'] if '国家食品药品监督管理局制' in datadict: del datadict['国家食品药品监督管理局制'] if '中华人民共和国' in datadict: del datadict['中华人民共和国'] if '药品许可证' in datadict: del datadict['药品许可证'] print(source_img_path) ######################################增加部分########################################### datadict['ID_CODE'] = id_code datadict['REMARK'] = '' datadict['ADD_USER'] = '******' datadict['JOB_ID'] = self._generatemd5(temp) ######################################增加部分########################################### print(datadict) if not datadict: nums = self._cleandata(datadict, datas, nums) continue try: self._data_to_db('DRUGMFRSCERT', datadict) nums = self._cleandata(datadict, datas, nums) except Exception as e: print('Error: ', e) self.logmgr.error(file[0] + '\\' + file_name + "insert error!! 
: " + str(e)) self._update_item('OCRWORKFILE','JOB_ID', jobid,'IS_TO_DB','F') nums = self._cleandata(datadict, datas, nums) continue
class GMP(Tools):
    """
    Recognition of GMP certificates.

    Turns OCR output (a list of entries carrying a ``'words'`` string) into a
    dict of certificate fields.  :meth:`gmp_delploy` works on in-memory OCR
    results; :meth:`gmp` walks a directory of OCR JSON files, records one job
    row per file and stores the extracted fields.

    Helpers such as ``_load_json``, ``_getscore``, ``_middict``,
    ``_generatemd5``, ``_data_to_db``, ``_update_item``, ``_cleandata``,
    ``_sort_index`` and the attributes ``job`` / ``codepath`` are not defined
    in this class -- presumably they come from ``Tools``; verify there.
    """

    # Keyword rules, tried in this order: (output key, pattern, extra offset
    # added to the end of the match before slicing out the value).
    # NOTE(review): the last rule emits '发证时间', while gmp()/gmp_delploy()
    # only copy '发证日期' -- a labelled issue date is therefore never copied
    # into the stored dict.  Left unchanged; confirm which key is intended.
    _KEYWORD_RULES = (
        ('企业名称_GMP', re.compile(r"企业*名称*|企*业名*称"), 0),
        ('证书编号', re.compile(r"证书*编号*|证*书编*号"), 1),
        ('地址', re.compile(r"地址"), 0),
        ('认证范围', re.compile(r"认证*范围*|认*证范*围"), 0),
        ('有效期至', re.compile(r"有效期至*|有效*期至"), 0),
        ('发证机关', re.compile(r"发证*机关*|发*证机*关"), 0),
        ('发证时间', re.compile(r"发证*日期*|发*证日*期"), 0),
    )
    # Only checked when the line prefix contains no colon.
    _RE_ABANDON = re.compile(r"经审*查")

    # Fields copied from the raw recognition result into the stored dict.
    _FIELD_NAMES = ('企业名称_GMP', '证书编号', '地址', '认证范围',
                    '有效期至', '发证机关', '发证日期')

    # Values written verbatim into every job row.
    _SERVER_ADDR = '10.67.28.8'
    _CUT_FILE_ROOT = 'G:\\IMG'

    def __init__(self, imgpath):
        """
        @imgpath ----root directory of the source images, used to build
                     SRC_FILE_PATH in :meth:`gmp`
        """
        Tools.__init__(self)
        self.imgpath = imgpath
        self.logmgr = LogMgr()

    def _recognize(self, datas, nums):
        """
        Main logic: build a ``{field: text}`` dict from the OCR entries.

        Each entry is passed to ``_judge_keywords``.  When a keyword is found
        the value is stored (or appended when the key already exists and was
        not the previous keyword).  Otherwise the entry is treated as a
        possible continuation of the previous keyword, or as a bare date for
        '发证日期' / '有效期至'.

        NOTE(review): the source of this method was flattened onto one line;
        the nesting below is reconstructed from statement order -- confirm
        against the original file.

        @datas ----list of OCR entries, each with a 'words' string
        @nums  ----number of entries to consider
        @return ----dict of recognised fields (may be empty)
        """
        keylist = []
        datadict = dict()
        for i, word in zip(range(nums), datas):
            text = word['words']
            list_result = self._judge_keywords(text)
            if list_result is not None:
                key, value, marker = list_result
                if key in datadict and keylist[-1][0] != key:
                    datadict[key] += value
                else:
                    datadict[key] = value
                # Remember the stored key and the keyword text as it appeared.
                keylist.append([key, marker])
                continue

            j = i
            while j > 0:
                if not keylist:
                    break
                # Lines that start with whitespace + latin letters are ignored.
                if re.match(r'\s[a-zA-Z]+', text):
                    break
                # Bare dates: candidates for '发证日期' and '有效期至'.
                if re.match(r'\d{4}|\d{2}', text):
                    if len(text) <= 4:
                        break
                    elif '/' in text:
                        if keylist[-1][0] == '发证机关':
                            datadict['发证日期'] = text
                            keylist.append(['杂', '杂'])
                            break
                    if '有效期至' in datadict:
                        if not re.search(r'\d{4}|\d{2}', datadict['有效期至']):
                            datadict['有效期至'] = text
                        break
                # The original guarded the rest with `if flag:`; flag is always
                # set once keylist is non-empty, so the guard is dropped.
                if keylist[-1][0] == '地址':
                    if i + 1 >= nums:
                        break
                    is_scope = self._judge_keywords(datas[i + 1]['words'])
                    if is_scope is not None and is_scope[0] == '认证范围':
                        datadict['认证范围'] = text
                        break
                if keylist[-1][0] == '有效期至':
                    break
                if keylist[-1][1] in datas[j]['words']:
                    last_key = keylist[-1][0]
                    # The '杂' sentinel is never a key of datadict; appending
                    # to it used to raise KeyError.
                    if last_key in datadict:
                        datadict[last_key] += text
                    break
                j -= 1
        return datadict

    def _judge_keywords(self, strword):
        """
        Detect a field keyword near the start of ``strword``.

        @return ----None when no keyword is found, otherwise a list:
            [0] key under which the value is stored
            [1] text following the keyword
            [2] the keyword exactly as it appeared in ``strword``
        e.g. '证书编号:H12345' -> ['证书编号', 'H12345', '证书编号']
             '证书号:H123'     -> ['证书编号', 'H123', '证书号']
        """
        # Only the first characters are inspected: 6 for strings of length
        # >= 8, the whole string otherwise (kept exactly as before).
        index = 6 if len(strword) >= 8 else len(strword)
        prefix = strword[:index]

        for key, pattern, offset in self._KEYWORD_RULES:
            # The address rule uses its own window; _sort_index is only called
            # once the earlier rules have failed, as in the original chain.
            if key == '地址':
                window = strword[:self._sort_index(strword)]
            else:
                window = prefix
            if pattern.search(window):
                hit = pattern.search(strword)
                return [key, strword[hit.span()[1] + offset:], hit.group()]

        # '经审查' is only recognised when the prefix holds no colon.
        has_colon = re.match(r'.+?(?:\:)', prefix) is not None
        if not has_colon and self._RE_ABANDON.search(prefix):
            hit = self._RE_ABANDON.search(strword)
            return ['经审查', strword[hit.span()[1]:], hit.group()]
        return None

    def _collect_fields(self, raw):
        """Copy the known fields from ``raw``, dropping one leading ':' from each value."""
        fields = dict()
        for name in self._FIELD_NAMES:
            if name in raw:
                value = raw[name]
                fields[name] = value[1:] if re.match('[::]', value) else value
        return fields

    def _split_company_and_address(self, fields):
        """Default the two fields to '' and move text after the first '公司' from the name to the address."""
        if '地址' not in fields:
            fields['地址'] = ''
        if '企业名称_GMP' not in fields:
            fields['企业名称_GMP'] = ''
        if re.search(r'.+公司.+', fields['企业名称_GMP']):
            parts = fields['企业名称_GMP'].split('公司')
            fields['地址'] = fields['地址'] + parts[1]
            fields['企业名称_GMP'] = parts[0] + '公司'

    def _drug_name(self, folder_name, file_name):
        """Return the first run of CJK characters in the folder name, else in the file name."""
        found = re.search(r'[\u4e00-\u9fa5]+', folder_name)
        if not found:
            found = re.search(r'[\u4e00-\u9fa5]+', file_name)
        # A run of CJK characters cannot contain '(', so the original
        # "cut at '('" step was a no-op and is omitted.
        return found.group()

    def gmp_delploy(self, imgs, idcode):
        """
        Extract GMP fields from already-OCR'd images.

        Returns the fields of the first image that has a usable OCR result.
        Images whose result carries 'error_code' are logged and skipped.

        @imgs   ----list of dicts with 'imgpath' and 'imgjson'
        @idcode ----unused; kept for interface compatibility
        @return ----dict of fields, or None when no image yields text
        """
        for file in imgs:
            if 'error_code' in file['imgjson']:
                self.logmgr.error(file['imgpath'] + " : Img Size Error!")
                continue
            datas = file['imgjson']['words_result']
            nums = file['imgjson']['words_result_num']
            if len(datas) > 0 and nums > 0:
                datadict = self._collect_fields(self._recognize(datas, nums))
                self._split_company_and_address(datadict)
                return datadict
        return None

    def gmp(self, datapath, id_code):
        """
        Process every OCR JSON file under ``datapath`` whose name contains 'GMP证书'.

        @datapath ----directory tree holding the OCR JSON files
        @id_code  ----identifier shared by one set of image files
        """
        for folder, _dirs, file_names in os.walk(datapath):
            for file_name in file_names:
                if 'GMP证书' in file_name:
                    self._process_gmp_file(folder, file_name, id_code)

    def _process_gmp_file(self, folder, file_name, id_code):
        """Record the job row for one JSON file, then recognise and store its fields."""
        json_file = folder + '\\' + file_name
        imgname = file_name.split('.')[0]
        curpath = folder.split('data')[1]
        index = imgname.rfind('_')
        folder_name = curpath[curpath.rfind('\\') + 1:]
        dragname = self._drug_name(folder_name, file_name)
        datajson = self._load_json(json_file)

        src_name = imgname[:index - 2] + '.' + 'pdf'
        cut_name = imgname[:index] + '.' + imgname[index:].split('_')[1]
        jobid = self._generatemd5(folder + imgname)
        jobdict = {
            'SER_IP': self._SERVER_ADDR,       # server
            'JOB_ID': jobid,
            'SRC_FILE_NAME': src_name,
            'SRC_FILE_PATH': self.imgpath + '\\' + curpath + '\\' + src_name,
            'CUT_FILE_NAME': cut_name,         # source image file
            'CUT_FILE_PATH': self._CUT_FILE_ROOT + '\\' + curpath,
            'HANDLE_TIME': time.strftime("%Y-%m-%d %X", time.localtime()),
            'DRUG_NAME': dragname,
            'FILE_TYPE': 'GMP证书',            # kind of image file
            'ID_CODE': id_code,
            'SRC_CO': curpath.split('\\')[1],  # branch company
            'FILE_REL_PATH': '\\' + cut_name,
            'SYS_URL': self._SERVER_ADDR,      # file server
            'PAGE_NUM': 1,
            'OCR_STATE': 'T',
            'REMARK': '',
            'ADD_USER': '******',
        }

        # An 'error_code' entry means the OCR service produced no result
        # (e.g. the image was too large).
        if 'error_code' in datajson:
            jobdict['IS_TO_DB'] = 'F'
            self.job.job_add(jobdict)
            self.job.job_todb()
            self.job.job_del()
            self.logmgr.error(json_file + ": img size error!")
            return

        datas = datajson['words_result']
        nums = datajson['words_result_num']
        has_text = len(datas) > 0 and nums > 0

        jobdict['MID_FILE_NAME'] = file_name   # intermediate file
        jobdict['MID_FILE_PATH'] = folder
        jobdict['OCR_SCORE'] = int(self._getscore(datas, nums))
        jobdict['IS_TO_DB'] = 'T' if has_text else 'F'
        jobdict['FILE_TEXT'] = self._middict(
            datas, self.codepath + '\\middata\\' + curpath, imgname)
        file_text = jobdict['FILE_TEXT']

        try:
            self.job.job_add(jobdict)
        except Exception:
            self.job.update_item('JOB_ID', jobid, 'IS_TO_DB', 'F')
        self.job.job_todb()
        self.job.job_del()

        if not has_text:
            return

        datadict = self._collect_fields(self._recognize(datas, nums))
        datadict['ID_CODE'] = id_code
        datadict['REMARK'] = ''
        datadict['ADD_USER'] = '******'
        # NOTE(review): this JOB_ID hashes the file text, while the job row
        # above hashes folder + image name -- confirm the two are meant to differ.
        datadict['JOB_ID'] = self._generatemd5(file_text)
        self._split_company_and_address(datadict)
        print(datadict)

        try:
            self._data_to_db('GMPCERT', datadict)
        except Exception as e:
            print('Error: ', e)
            self._update_item('OCRWORKFILE', 'JOB_ID', jobid, 'IS_TO_DB', 'F')
            self.logmgr.error(json_file + "insert error!! : " + str(e))
        self._cleandata(datadict, datas, nums)
class MyOcr(object):
    """
    Text recognition through the Baidu AI OCR client.

    @app_id
    @api_key
    @secret_key   credentials issued by the Baidu AI platform
    @typeid       which recognition call _ocr() uses
        1--general text recognition
        2--general text recognition with position information
        3--high-accuracy text recognition
        4--high-accuracy text recognition with position information
    """

    # typeid -> name of the client method used by _ocr().
    _RECOGNIZERS = {
        1: 'basicGeneral',
        2: 'general',
        3: 'basicAccurate',
        4: 'accurate',
    }

    def __init__(self, typeid, app_id = APP_ID, api_key = API_KEY, secret_key = SECRET_KEY):
        self.client = AipOcr(app_id, api_key, secret_key)
        self.typeid = typeid
        self.codepath = os.path.dirname(__file__)
        # '\\data' spells out the backslash; the former '\data' relied on an
        # invalid escape sequence.
        self.datapath = self.codepath + '\\data'
        os.makedirs(self.datapath, exist_ok=True)
        self.log = LogMgr()

    def _ocr_options(self):
        """Return the option dict sent with every OCR request."""
        return {
            "detect_direction": "true",
            "detect_language": "true",
            "probability": "true",
        }

    def _get_file_content(self, filePath):
        """Read an image file and return its bytes."""
        with open(filePath, 'rb') as fp:
            return fp.read()

    def _write_json_file(self, filepath, data):
        """Serialise ``data`` as JSON (non-ASCII kept as-is) into ``filepath``."""
        with open(filepath, 'w', encoding = 'utf-8') as fw:
            fw.write(json.dumps(data, ensure_ascii=False))

    def _list_custom(self, path):
        """Return (entries of the first sub-entry of ``path``, path of that sub-entry)."""
        first = path + '\\' + os.listdir(path)[0]
        return os.listdir(first), first

    def ocr_deploy(self, rec_dict):
        """
        Run high-accuracy OCR on the base64 images carried in ``rec_dict``.

        For every entry of ``rec_dict['files']`` whose 'type' names a supported
        document kind, each image in 'imgs' gets an 'imgjson' key holding the
        OCR result.  Images whose request fails are logged and left without
        'imgjson'.

        @return ----the same ``rec_dict``, updated in place
        """
        options = self._ocr_options()
        for file in rec_dict['files']:
            if not re.search(r'进口注册证|GMP|说明书|药品再注册批件|营业执照|生产许可证|进口药品许可证|进口药品注册证', file['type']):
                continue
            for img in file['imgs']:
                print('Current img: {}'.format(img['imgpath']))
                try:
                    raw = base64.b64decode(bytes(img['base64'], encoding='utf-8'))
                    data = self.client.accurate(raw, options)
                except Exception as e:
                    print('Error: ', e)
                    self.log.error(img['imgpath'] + "Error! : " + str(e))
                    continue
                img.update({"imgjson" : data})
        return rec_dict

    def _ocr(self, imgpath):
        """
        OCR the jpg images under ``imgpath`` and save each result as JSON.

        Results go to ``self.datapath`` plus the part of the image directory
        that follows 'IMG', as ``<name>_<ext>.json``.  Images whose JSON file
        already exists are skipped.

        @imgpath ----root of the image tree; its path must contain 'IMG'
        Raises ValueError when ``self.typeid`` is not 1-4 and an image needs
        recognising.
        """
        options = self._ocr_options()
        # Fail fast on a bad root: os.walk silently yields nothing for it.
        os.listdir(imgpath)
        method_name = self._RECOGNIZERS.get(self.typeid)

        for folder, _dirs, file_names in os.walk(imgpath):
            for file_name in file_names:
                # NOTE(review): unlike ocr_deploy(), this list has no
                # '进口药品注册证' -- confirm whether that is intended.
                if not re.search(r'进口注册证|GMP|说明书|药品再注册批件|营业执照|生产许可证|进口药品许可证', file_name):
                    continue
                if '备案' in file_name:
                    continue
                img_file = folder + '\\' + file_name
                if os.path.isdir(img_file):
                    continue
                if not re.match(r'[jJ][pP][gG]', file_name[-3:]):
                    continue

                datafilepath = self.datapath + folder.split('IMG')[1]
                os.makedirs(datafilepath, exist_ok=True)

                # Drop dots from the stem so the name splits into exactly
                # (prefix, suffix).  The former `.find('.')` truth test skipped
                # this only when the dot was the first character.
                clean_name = file_name[:-4].replace('.', '') + file_name[-4:]
                try:
                    prefix, suffix = clean_name.split('.')
                except ValueError as e:
                    print('split error: {}\ncurrent file: {}'.format(e, img_file))
                    self.log.error(img_file + " Error!! : " + str(e))
                    continue

                # Built by concatenation: str.format on the whole path broke
                # on directories containing '{' or '}'.
                json_file = datafilepath + '\\' + prefix + '_' + suffix + '.json'
                if os.path.isfile(json_file):
                    continue

                print('Current img: {}'.format(img_file))
                if method_name is None:
                    # Previously surfaced as UnboundLocalError on `data`.
                    raise ValueError(
                        'unsupported typeid: {!r} (expected 1-4)'.format(self.typeid))
                # Read the image only once we know it has to be recognised.
                img = self._get_file_content(img_file)
                try:
                    data = getattr(self.client, method_name)(img, options)
                except Exception as e:
                    print('Error: ', e)
                    self.log.error(img_file + " Error!! : " + str(e))
                    continue
                self._write_json_file(json_file, data)

    def _write_dict(self):
        """Print the result of ``introduction.introduction`` for every entry of the data directory."""
        for file in os.listdir(self.datapath):
            format_data = introduction.introduction(self.datapath + '\\' + file)
            print(format_data)

    def pdf2img(self):
        """
        Convert the files under '<codepath>/PDF/说明书/' to jpg, one image per page.

        Pages are written to '<codepath>/IMG/图片/' as ``<name><page index>.jpg``.
        """
        file_dir = self.codepath + '/PDF/说明书/'
        save_dir = self.codepath + '/IMG/图片/'
        os.makedirs(save_dir, exist_ok=True)
        for root, _dirs, file_names in os.walk(file_dir):
            for file_name in file_names:
                # splitext copes with names holding several dots.
                file_name_prefix = os.path.splitext(file_name)[0]
                # Join with the walked directory so files in sub-folders are found.
                file = os.path.join(root, file_name)
                with Image(filename=file, resolution=300) as img:
                    images = img.sequence
                    for i in range(len(images)):
                        images[i].type = 'truecolor'
                        save_name = save_dir + file_name_prefix + str(i) + '.jpg'
                        # Context manager releases each page image after saving.
                        with Image(images[i]) as page_img:
                            page_img.save(filename=save_name)

    def run(self, imgpath):
        """Entry point: OCR everything under ``imgpath``."""
        print('********Start Identify********')
        self._ocr(imgpath)
        print('********End********')
# -*- coding : utf-8 -*- import os import json from DatabaseToolsNew import cxOracle import re from FindKeyword import findImportWords import HowManyColumn4 as hmc #import openpyxl import xlwings as xw from log import LogMgr logmgr = LogMgr() ''' 使用openpyxl太慢了,改用xlwings wb = openpyxl.load_workbook('C:\\Users\\DevinChang\\Desktop\\四家分公司影印件清单_去重匹配版.xlsx') sheets = wb.sheetnames sheet = wb.get_sheet_by_name(sheets[0]) shopid = sheet['B'] name = sheet['C'] strength = sheet['D'] mfrs = sheet['F'] ''' def load_excel(excel): wb = xw.Book(excel) sheet = wb.sheets[0] shopid = sheet['B:B'].value name = sheet['C:C'].value strength = sheet['D:D'].value