def baidu_image2str_url(uuid_url_dict=None, types="characters"):
    """OCR a batch of remote images via Baidu's general-text recognition API.

    Parameters:
        uuid_url_dict: mapping of uuid -> image url (defaults to an empty dict).
        types: unused; kept for backward compatibility with existing callers.

    Returns:
        dict mapping each uuid to its recognized text ("" on unrecoverable failure).
    """
    from aip import AipOcr

    # BUG FIX: the original used a mutable default argument (uuid_url_dict={}),
    # which is shared across calls; use None and substitute a fresh dict.
    if uuid_url_dict is None:
        uuid_url_dict = {}
    client = AipOcr(Baidu_APP_ID, Baidu_API_KEY, Baidu_SECRET_KEY)  # create a connection
    options = {"probability": "true"}
    uuid_text_dict = {}
    for uuid, url in uuid_url_dict.items():
        resp = client.basicGeneralUrl(url, options)  # url-based recognition
        if "error_msg" in resp:
            print("url recognition failed! Using local model. url: " + url)
            if resp["error_msg"] in ("url response invalid", "image size error"):
                # Request the image at the url ourselves, convert it to a valid
                # format, then fall back to the local-file recognition path.
                image_path = image_transform(url_img_download(url))
                print(image_path)
                uuid_text_dict[uuid] = baidu_image2str_local(image_path)
            else:
                uuid_text_dict[uuid] = ""
        else:
            # Keep only confidently recognized fragments (avg probability > 0.85);
            # join() replaces the original quadratic string concatenation.
            uuid_text_dict[uuid] = "".join(
                tex["words"]
                for tex in resp["words_result"]
                if tex["probability"]["average"] > 0.85
            )
    return uuid_text_dict
class BaiduAIP(object):
    """Thin wrapper around Baidu OCR used to read a single verification code.

    NOTE(review): `self.picture_path` is never assigned in this class; it
    appears to be set by the caller before `get_code()` runs — confirm against
    the call sites.
    """

    def __init__(self):
        # APP_ID / API_KEY / SECRET_KEY are expected at module level.
        self.client = AipOcr(APP_ID, API_KEY, SECRET_KEY)

    def get_file_content(self, file_path):
        """Read an image file and return its raw bytes."""
        with open(file_path, 'rb') as fp:
            return fp.read()

    def get_code(self):
        """Call Baidu OCR on `self.picture_path` (a url) and parse the result.

        Returns the recognized text when exactly one line is found with an
        average confidence above 0.7; otherwise returns 'do again' so the
        caller can retry with a fresh image.  (The original Chinese comment
        mentioned a 0.8 threshold, but the code has always used 0.7.)
        """
        options = {
            "language_type": "ENG",
            "detect_direction": "true",
            "detect_language": "true",
            "probability": "true"
        }
        response = self.client.basicGeneralUrl(self.picture_path, options)
        print(response, self.picture_path)
        data = response
        words_result = data['words_result']
        # Idiom fix: len(...) instead of calling __len__() directly.
        if isinstance(words_result, list) and len(words_result) == 1:
            if words_result[0]['probability']['average'] > 0.7:
                return words_result[0]['words']
            return 'do again'
        return 'do again'
def getOCRResult(self):
    """Run Baidu general OCR on `self.imgURL` and return the 'words_result' list."""
    ocr_client = AipOcr(self.APP_ID, self.API_KEY, self.SECRET_KEY)
    # Optional request parameters for the general-text endpoint.
    request_opts = {
        "language_type": "CHN_ENG",
        "detect_direction": "true",
        "detect_language": "true",
        "probability": "false",
    }
    # url-based general recognition with the options above.
    response = ocr_client.basicGeneralUrl(self.imgURL, request_opts)
    return response['words_result']
def pic_to_word(urls):
    """OCR a remote image url with Baidu general recognition.

    Parameters:
        urls: the image url to recognize.

    Returns:
        All recognized lines joined together, each followed by a newline.
    """
    APP_ID = '16459310'
    API_KEY = 'XMntwgKcwzsuuLUIqhBWw9uZ'
    SECRET_KEY = 'iWESOZKlmB5vw3hL5T3MniNTdZCfAgL4'
    client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
    # General text recognition with a remote url image.
    text = client.basicGeneralUrl(urls)
    print(text)
    # Idiom fix: build the result with join() instead of quadratic string
    # concatenation over an index loop.  We still honor 'words_result_num'
    # (rather than len(words_result)), exactly as the original did.
    lines = text['words_result'][:text['words_result_num']]
    return ''.join(str(entry['words']) + '\n' for entry in lines)
def parse_url_pdf(url):
    """Run Baidu general OCR on a remote image url and print the raw response."""
    # Optional parameters for the recognition request.
    opts = {
        "language_type": "CHN_ENG",
        "detect_direction": "true",
        "detect_language": "true",
        "probability": "true",
    }
    ocr = AipOcr('14964808', 'AVWLHd7wAOxf4kijuImGZzVH',
                 'SEZTAAYH92VFTFXEvc75Vyi4nROfE0I0')
    # General text recognition on the url; dump whatever comes back.
    print(ocr.basicGeneralUrl(url, opts))
def verifying_code(
        url='https://paulzhangcc.oss-cn-beijing.aliyuncs.com/1.png'):
    """Recognize the captcha image at `url` with Baidu OCR.

    Returns the raw API response dict (also printed for debugging).
    """
    # Your APPID / AK / SK.
    APP_ID = '11086206'
    API_KEY = 'YTDMZxqm63fokY7UTOxSVowX'
    SECRET_KEY = 'HkTXYq0BzQ85kkvTAxemQi4SVNwaNUDf'
    client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
    # Optional parameters.  FIX: the original assigned "language_type" twice
    # with the same value; the redundant duplicate assignment is removed.
    options = {
        "language_type": "CHN_ENG",
        "detect_direction": "false",
        "detect_language": "true",
    }
    # General text recognition with a remote url image.
    result = client.basicGeneralUrl(url, options)
    print("识别图片百度云返回的结果:", result)
    return result
def get_BaiDuPicExtract(url=""):
    """Run Baidu general OCR on a remote image url.

    Parameters:
        url: the image url to recognize.

    Returns:
        The first recognized line's text, or None when nothing was recognized.
    """
    from aip import AipOcr
    print(url)
    # Your APPID / AK / SK.
    APP_ID = '14709450'
    API_KEY = '0zw6HYjooiU1kmkynNGqSn4T'
    SECRET_KEY = 'u2Qag7hW2DnUkFz96on3FrDzLtVmwCDH'
    client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
    # CLEANUP: the original defined a local-file reader here that was never
    # used (this function only does url-based recognition); it is removed.
    # Optional parameters.
    options = {
        "language_type": "CHN_ENG",
        "detect_direction": "true",
        "detect_language": "true",
        "probability": "true",
    }
    # General text recognition with a remote url image.
    r_json = client.basicGeneralUrl(url, options)
    # Sample success payload:
    # {'log_id': ..., 'direction': 0, 'words_result_num': 1,
    #  'words_result': [{'words': '82-8=',
    #                    'probability': {'variance': ..., 'average': ..., 'min': ...}}],
    #  'language': -1}
    print(r_json)
    words_result = r_json.get("words_result", [])
    words = None
    if words_result:
        words = words_result[0].get("words", "")
    return words
# -*- coding: utf-8 -*-
# @Time : 2018/4/11 16:14
# @Author : Shark
# @File : verification_code.py
# @Software : PyCharm
from aip import AipOcr

APP_ID = '10769158'
API_KEY = 'C2Uo8OGGY1O1RVwGOkrb68it'
SECRET_KEY = '1ldb72IgGBVhrmrjtC3jTi8iyWnU70OM'
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)


def get_file_content(filePath):
    """Return the raw bytes of the image at *filePath*."""
    with open(filePath, 'rb') as fp:
        return fp.read()


# The local captcha image is loaded here, although the recognition call
# below actually operates on the remote url.
image = get_file_content('picture/captcha.jpg')
url = 'https://www.douban.com/misc/captcha?id=U5ivIQP8K6C4lUTSJY9DaeJg:en&size=s'
print(client.basicGeneralUrl(url))
# Baidu AI platform cloud captcha-OCR demo.  Recognition accuracy on this
# captcha is very low; the snippet only demonstrates how to call the API.
from aip import AipOcr

# Your APPID / AK / SK (placeholders).
APP_ID = '你的APP_ID'
API_KEY = '你的API_KEY'
SECRET_KEY = '你的SECRET_KEY '
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)

url = "https://sam.huat.edu.cn:8443/selfservice/common/web/verifycode.jsp"

# General recognition on the remote url, first without options
# (the result is deliberately discarded, as in the demo).
client.basicGeneralUrl(url)

# Same call again with optional parameters.
options = {
    "language_type": "CHN_ENG",
    "detect_direction": "true",
    "detect_language": "true",
    "probability": "true",
}
res = client.basicGeneralUrl(url, options)
print(res)
# Optional parameters for the recognition calls.
options = {
    "language_type": "CHN_ENG",
    "detect_direction": "true",
    "detect_language": "true",
    "probability": "true",
}
# General text recognition with a local image, using the options.
client.basicGeneral(image, options)

# url = "http//www.x.com/sample.jpg"
url = "http://www.lmth2013.com/validatecode.aspx"
# General text recognition with a remote url image (no options).
web_acc = client.basicGeneralUrl(url)
print(web_acc)
# A variant passing the same options would be:
# client.basicGeneralUrl(url, options)
def main():
    """Scrape pork/piglet price data, OCR a government-published image, pull
    Wind terminal series, and write the combined figures into the Excel
    workbook that invoked this macro.

    NOTE(review): relies on module-level imports (xw/xlwings, requests, etree,
    re, datetime, AipOcr, w/WindPy, pd, np) and on live network endpoints —
    none of which are visible in this chunk; confirm at module top.
    """
    book = xw.Book.caller()  # the workbook that called this macro
    """爬取农业部数据"""
    # Scrape Ministry of Agriculture price announcements.
    today = datetime.date.today()
    # First day of the current month.
    firstmonthday = datetime.datetime(today.year, today.month, 1)
    # First day of the current year.
    firstday = datetime.datetime(today.year, 1, 1)
    oneday = datetime.timedelta(days=1)
    # All scraped article links: key = date string, value = link.
    # (NOTE: shadows the builtin `all` inside this function.)
    all = {}
    url = 'http://www.scs.moa.gov.cn/scxxfb/'
    # Crawl the index page.
    response = requests.get(url)
    content = response.content
    page = etree.HTML(content)
    data = page.find('.//div[@class="sj_e_tonzhi_list"]')
    for i in data:
        infos = i.findall('.//li')
        for info in infos:
            rrr = info.find('.//a')
            link = url + str(rrr.get('href'))
            # Article date is embedded in the link as t<digits>.
            date = re.findall(r'.\w+.t(\d+)\w+', link)
            all[date[0]] = str(link)
    # Crawl the 12 paginated index pages the same way.
    for i in range(1, 13):
        url = 'http://www.scs.moa.gov.cn/scxxfb/index_' + str(i) + '.htm'
        response = requests.get(url)
        content = response.content
        page = etree.HTML(content)
        data = page.find('.//div[@class="sj_e_tonzhi_list"]')
        for i in data:
            infos = i.findall('.//li')
            for info in infos:
                rrr = info.find('.//a')
                link = 'http://www.scs.moa.gov.cn/scxxfb/' + str(
                    rrr.get('href'))
                date = re.findall(r'.\w+.t(\d+)\w+', link)
                all[date[0]] = str(link)
    #print(all)

    # Fetch one announcement page and regex-extract the pork price from it.
    def price_get(link):
        response = requests.get(link)
        content = response.content
        page = etree.HTML(content)
        info = page.find('.//div[@class="TRS_Editor"]')
        text = info.find('.//p').text
        price = re.findall(r'猪肉\D+(\d+.\d+)元', text)
        return price

    price1 = {}  # pork prices: key = date string, value = extracted price list
    # Today's price; walk backwards to the most recent published day if
    # today's announcement is not out yet.  (while/else: the else branch
    # runs once the loop condition becomes false.)
    while today.strftime('%Y%m%d') not in all.keys():
        today -= oneday
    else:
        d_p_price = price_get(str(all[today.strftime('%Y%m%d')]))
        price1[today.strftime('%Y%m%d')] = d_p_price
    # Month-open price: published on the first working day of the month,
    # so walk forward until a published day is found.
    while firstmonthday.strftime('%Y%m%d') not in all.keys():
        firstmonthday += oneday
    else:
        m_p_price = price_get(str(all[firstmonthday.strftime('%Y%m%d')]))
        price1[firstmonthday.strftime('%Y%m%d')] = m_p_price
    # Year-open price: published on the first working day of the year.
    while firstday.strftime('%Y%m%d') not in all.keys():
        firstday += oneday
    else:
        y_p_price = price_get(str(all[firstday.strftime('%Y%m%d')]))
        price1[firstday.strftime('%Y%m%d')] = y_p_price
    # The corresponding list of prices (kept for reference):
    #l=[price1[today.strftime('%Y%m%d')],price1[firstmonthday.strftime('%Y%m%d')],price1[firstday.strftime('%Y%m%d')]]
    print(price1)
    # price1 now maps: today -> today's price, firstmonthday -> month-open,
    # firstday -> year-open.
    ''' 最终结果是price1是一个字典 pric1e[today.strftime('%Y%m%d')]是今日价格 price1[firstmonthday.strftime('%Y%m%d')]是本月初 price1[firstday.strftime('%Y%m%d')]是本年初 '''
    """爬取二元能繁母猪数据"""
    # Scrape the binary breeding-sow price: Baidu cloud account (redacted).
    APP_ID = '#####'
    API_KEY = '########'
    SECRECT_KEY = '########'
    client = AipOcr(APP_ID, API_KEY, SECRECT_KEY)
    # Search page for the sow-price announcement; take the first hit.
    url = 'http://sousuo.gov.cn/s.htm?q=%E4%BA%8C%E5%85%83%E6%AF%8D%E7%8C%AA%E9%94%80%E5%94%AE%E4%BB%B7%E6%A0%BC&t=govall&timetype=timeqb&mintime=&maxtime=&sort=pubtime&sortType=1&nocorrect='
    response = requests.get(url)
    content = response.content
    page = etree.HTML(content)
    table = page.find('.//h3[@class="res-title"]')
    channels = table.find('.//a')
    link = channels.get('href')
    #print(link)
    # Fetch the newest announcement and read its title.
    html = requests.get(link)
    html.encoding = 'utf-8'
    text = html.text
    page1 = etree.HTML(text)
    info = page1.find('.//div[@class="article oneColumn pub_border"]')
    t = info.find('.//h1')
    title = t.text
    #print(title)
    # Extract the data date from a fixed slice of the title.
    # NOTE(review): the year '2020年' is hard-coded — will misdate in later years.
    datestr = title[len(title) - 14:len(title) - 9]
    date = '2020年' + datestr
    date1 = datetime.datetime.strptime(date, '%Y年%m月%d日')
    #print(date1)
    # Locate the data image inside the announcement body.
    content1 = page1.find('.//div[@class="pages_content"]')
    channels1 = content1.find('.//img')
    link_img = channels1.get('src')
    links = str(link)
    # Build an absolute image url by trimming the page filename off the link.
    pic_urls = links[:len(links) - 19] + link_img
    # OCR the image with the Baidu API; the price sits at a fixed index (5)
    # in the recognized lines.
    prices = client.basicGeneralUrl(pic_urls)
    r = prices['words_result']
    info = r[5]
    price = info['words']
    #print(price)
    # Weekly binary-sow price keyed by week number.
    pork_price = {}
    week = date1.strftime("%W")
    pork_price[week + '周'] = price
    print(pork_price)
    """抓取wind数据 写入excel"""
    # Fetch Wind data and write it to Excel: connect to the Wind database.
    w.start()
    w.isconnected()
    # Piglet series (codes redacted).
    pig_baby_codes = ['############']
    pig_baby = w.edb(pig_baby_codes,
                     datetime.date.today() + datetime.timedelta(days=-5),
                     datetime.date.today(),
                     usedf=True,
                     ShowBlank=0)
    pig_baby = pig_baby[1]
    pig_baby.columns = ['###########']  # piglet region labels (redacted)
    # Average piglet prices per adjacent column pair (regional aggregation).
    pig_baby_mean = pd.DataFrame([])
    pig_baby_mean_names = ['##########']  # aggregated region labels (redacted)
    for i in range(1, 13, 2):
        pig_baby_mean[pig_baby_mean_names[int(
            (i - 1) / 2)]] = (pig_baby.iloc[:, i - 1] + pig_baby.iloc[:, i]) / 2
    print(pig_baby_mean)
    # Hog series (codes redacted).
    pig_codes = ["###############"]
    pig = w.edb(pig_codes,
                datetime.date.today() + datetime.timedelta(days=-4),
                datetime.date.today(),
                usedf=True,
                ShowBlank=0)
    pig = pig[1]
    pig.columns = ["###############"]  # hog region labels (redacted)
    # Average hog prices per aggregated region (mean of matching columns).
    pig_mean = pd.DataFrame(np.zeros((4, 5)))
    pig_mean_names = ["###########"]  # aggregated region labels (redacted)
    pig_mean.columns = pig_mean_names
    print(pig_mean)
    pig_mean.index = pig.index[1:]
    for name in pig_mean_names:
        i = 0
        for n in list(pig.columns):
            if name in n:
                pig_mean[name] = pig_mean[name] + pig[n]
                i += 1
        pig_mean[name] = pig_mean[name] / i
    print(pig_baby_mean)
    # Corn spot price.
    corn_codes = ['S5005793']
    corn = w.edb(corn_codes,
                 datetime.date.today() + datetime.timedelta(days=-5),
                 datetime.date.today(),
                 usedf=True,
                 ShowBlank=0)
    corn = corn[1]
    corn.columns = ['现货价:玉米:平均价']
    corn = corn.T
    print(corn)
    # Close the Wind session.
    w.stop()
    # Combine piglet / hog / pork / corn prices into one flat list (alldata).
    pig_baby_mean = pig_baby_mean.T
    pig_mean = pig_mean.T
    pig_baby_data = list(pig_baby_mean[pig_baby_mean.columns[-1]])
    pig_baby_data.append(np.mean(pig_baby_data))
    pig_data = list(pig_mean[pig_mean.columns[-1]])
    pig_data.append(np.mean(pig_data))
    corn_data = list(corn[corn.columns[-1]])
    pig_baby_data.extend(pig_data)
    pig_baby_data.extend(corn_data)
    pig_baby_data.append(float(price1[today.strftime('%Y%m%d')][0]))
    alldata = pig_baby_data
    print(alldata)
    # The last 5 days: `days` as datetimes, `days1` as formatted strings,
    # both reversed into chronological order.
    days = [
        datetime.datetime.today() + datetime.timedelta(days=-i)
        for i in range(5)
    ]
    days1 = [days[i].strftime('%Y-%m-%d') for i in range(5)]
    days.reverse()
    days1.reverse()
    print(days)
    # The last 5 week labels (week_nows), chronological.
    week_list = {}
    today = datetime.date.today()
    weeks = today.strftime("%W")
    week_n = int(weeks)
    week_list[week_n] = week_n
    l = [week_list[week_n] - i for i in range(5)]
    for i in range(5):
        l[i] = str(l[i]) + '周'
    l.reverse()
    print(l)
    week_nows = l
    # Target worksheet.
    sht = book.sheets[0]
    # Decide whether the yearly/monthly sow cells need updating.
    firstday_week = datetime.datetime(datetime.date.today().year,
                                      datetime.date.today().month,
                                      1).strftime("%W") + '周'
    if week_nows[-1] == '1周':
        sht.range('Q8').value = float(price)
    if week_nows[-1] == firstday_week:
        sht.range('P8').value = float(price)
    # Decide whether the yearly/monthly piglet/hog/pork/corn cells need updating.
    if days1[-1][6:] == '01-01':
        sht.range('Q11:Q25').options(transpose=True).value = alldata
    if days1[-1][9:] == '01':
        sht.range('P11:P25').options(transpose=True).value = alldata
    # Update the main data area (skipped when today's data is already written).
    # Binary breeding sows: shift the window left, then write the new value.
    if sht.range('K7').value == week_nows[-1]:
        pass
    else:
        sht.range('G8:J8').value = sht.range('H8:K8').value
        sht.range('K8').value = float(price)
    # Piglet / hog / pork / corn: refresh headers, shift, write new column.
    if sht.range('K9').value.date() == days[-1].date():
        pass
    else:
        sht.range('G7:K7').value = week_nows
        sht.range('G9:K9').value = days1
        sht.range('G11:J25').value = sht.range('H11:K25').value
        sht.range('K11:K25').options(transpose=True).value = alldata
class Ocr_tool(object):
    """Helper around Baidu OCR: crops a screenshot region and extracts text tags."""

    def __init__(self):
        # Credentials come from the enclosing module.
        self.client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
        # Minimum average confidence for a recognized line to be kept.
        self.p_thres = 0.5

    def get_file_content(self, filepath):
        """Return the raw bytes of the file at *filepath*."""
        with open(filepath, 'rb') as fp:
            return fp.read()

    def read_image(self, filepath):
        """Open *filepath* as a PIL image."""
        return Image.open(filepath)

    def crop_image(self, im):
        """Crop the fixed region of interest (ratios relative to a 2160x1080 layout)."""
        width, height = im.size
        box = (int(width * 627 / 2160),
               int(height * 531 / 1080),
               int(width * 1421 / 2160),
               int(height * 737 / 1080))
        return im.crop(box)

    def show_im(self, im):
        """Display the image (debug helper)."""
        im.show()

    def image2byte(self, im):
        """Serialize a PIL image to PNG bytes."""
        buf = BytesIO()
        im.save(buf, format='PNG')
        return buf.getvalue()

    def ocr(self, image=None, url=None):
        """Run Baidu general OCR on a PIL image or a remote url.

        Returns the recognized lines whose average confidence reaches
        self.p_thres; an empty list when neither input is given or the
        response carries no 'words_result'.
        """
        opts = {"language_type": "CHN_ENG", "probability": "true"}
        if image is not None:
            res = self.client.basicGeneral(self.image2byte(image), opts)
        elif url is not None:
            res = self.client.basicGeneralUrl(url, opts)
        else:
            return []
        if "words_result" not in res:
            return []
        return [entry["words"]
                for entry in res["words_result"]
                if not entry["probability"]["average"] < self.p_thres]

    def get_im_from_url(self, imgurl):
        """Download *imgurl* and return it as a PIL image."""
        resp = requests.get(imgurl, timeout=30)
        return Image.open(BytesIO(resp.content))

    def get_tags_from_url(self, imgurl, crop=False):
        """OCR an image url; with *crop*, download, crop, then OCR locally."""
        if not crop:
            return self.ocr(url=imgurl)
        im = self.get_im_from_url(imgurl)
        if im.mode == 'P':
            print("MYDEBUG mode:P")
            return []
        return self.ocr(image=self.crop_image(im))
# (A local-image variant using client.basicGeneral(image[, options]) was
# previously sketched here in comments.)
url = "https://wx2.sinaimg.cn/mw690/006NGRWIly1fwz7o2gugij30u00ez419.jpg"

# General recognition on the remote url, first without options
# (result discarded, as in the original demo).
client.basicGeneralUrl(url)

# Optional parameters for the second call.
options = {
    "language_type": "CHN_ENG",
    "detect_direction": "true",
    "detect_language": "true",
    "probability": "true",
}
ocr_result = client.basicGeneralUrl(url, options)
words_result = ocr_result['words_result']
# Each recognized line is prefixed with a newline, matching the original loop.
result = ''.join('\n' + entry['words'] for entry in words_result)
# General text recognition with a local image (no options).
resp = client.basicGeneral(image)
print('resp1', resp)
print('---------------------------------------------')

# Optional parameters.
options = {
    "language_type": "CHN_ENG",
    "detect_direction": "true",
    "detect_language": "true",
    "probability": "true",
}
# Local image again, this time with the options (result discarded).
client.basicGeneral(image, options)

url = "https://ss1.bdstatic.com/70cFuXSh_Q1YnxGkpoWK1HF6hhy/it/u=2844325179,1671562938&fm=26&gp=0.jpg"
# Remote url image without options (result discarded).
client.basicGeneralUrl(url)

# Rebuild the options dict, as the original script did, then call with it.
options = {
    "language_type": "CHN_ENG",
    "detect_direction": "true",
    "detect_language": "true",
    "probability": "true",
}
resp = client.basicGeneralUrl(url, options)
print('resp2', resp)
# Your APPID / AK / SK for the OCR and speech applications.
APP_ID_OCR = '22855479'
APP_ID_SPEECH = '22844737'
API_KEY_OCR = 'dEbfdWGaDhu7yG4h07OMaSU3'
API_KEY_SPEECH = 'Gc4Vtsvw3dpjxjuEpCrFlq8d'
SECRET_KEY_OCR = 'V0hD45LqGugfCnZe9eNb6ih5cp5d7Xj4'
SECRET_KEY_SPEECH = 'mBbpvR3tA7wm561dtmchP5MMjPsVnGt4'

client_ocr = AipOcr(APP_ID_OCR, API_KEY_OCR, SECRET_KEY_OCR)
client_speech = AipSpeech(APP_ID_SPEECH, API_KEY_SPEECH, SECRET_KEY_SPEECH)

url = "https://ss1.bdstatic.com/70cFuXSh_Q1YnxGkpoWK1HF6hhy/it/u=2844325179,1671562938&fm=26&gp=0.jpg"

# First OCR call without options (result discarded, as in the original).
client_ocr.basicGeneralUrl(url)

# Second call with optional parameters.
options = {
    "language_type": "CHN_ENG",
    "detect_direction": "true",
    "detect_language": "true",
    "probability": "true",
}
resp = client_ocr.basicGeneralUrl(url, options)
words_result = resp['words_result']
words = [record['words'] for record in words_result]

# Synthesize the recognized text to audio.
result = client_speech.synthesis(','.join(words), 'zh', 1, {
    'vol': 5,
})
# encoding:utf-8
import requests
from aip import AipNlp
from aip import AipOcr

# Your APPID / AK / SK.
APP_ID = '17925688'
API_KEY = 'Ra8Mg2MLx2a8E9hs6BKrBp3D'
SECRET_KEY = 'VPeAIciWGBEqQcXs1lDNdY5rdubRoBaG'
clientOcr = AipOcr(APP_ID, API_KEY, SECRET_KEY)
clientNlp = AipNlp(APP_ID, API_KEY, SECRET_KEY)

url = "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1575441285116&di=54ac097a7c11ff5c211ad13d788211a4&imgtype=0&src=http%3A%2F%2Fphotocdn.sohu.com%2F20131025%2FImg388882088.jpg"

# Optional parameters for the general recognition endpoint.
options = {
    "language_type": "CHN_ENG",
    "detect_direction": "true",
    "detect_language": "true",
    "probability": "true",
}
# OCR on the remote url image, then dump the raw response.
text = clientOcr.basicGeneralUrl(url, options)
print(text)
# Lexical analysis of the result (kept disabled, as in the original):
#print(clientNlp.lexer(text))