def login():
    """Post the Douban login form and OCR the captcha image found in the reply.

    The form is sent with ``post_http``; the captcha image URL is extracted
    from the response, downloaded with ``get_file`` and stored with
    ``save_file``. The recognised text is printed using both tesserocr and
    pytesseract.

    Returns None. When the response contains no captcha image the function
    prints a notice and returns early instead of failing with ``IndexError``.
    """
    url = 'https://www.douban.com/accounts/login'
    # NOTE(review): assumes "../../../../img" (used by save_file below)
    # resolves to the directory of this absolute path - confirm.
    captcha_path = '/home/howell/PycharmProjects/luckwine-spider/img/captcha.jpg'
    # NOTE(review): account data is hard-coded; consider loading it from config.
    formdata = {
        "source": "None",
        "form_email": "15817161961",
        "redir": "https://www.douban.com",
        "form_password": "******",
        "captcha-solution": "brother",  # captcha answer
        "captcha-id": "LaWfXy8orhmAw5XhjVtF2hQs:en",  # captcha image id
        "remember": "on"
    }
    response = post_http(url, formdata)

    res_tr = r'<img id="captcha_image" src="(.*?)" alt="captcha" class="captcha_image"/>'
    matches = re.findall(res_tr, str(response), re.S | re.M)
    if not matches:
        # No captcha was served, so there is nothing to download or recognise.
        print('no captcha image found in login response')
        return
    img_link = matches[0]
    print(img_link)

    save_file("../../../../img", "captcha.jpg", get_file(img_link))
    print(tesserocr.file_to_text(captcha_path))
    # Open the image once and reuse it for both the repr and pytesseract.
    captcha_image = Image.open(captcha_path)
    print(captcha_image)
    print(pytesseract.image_to_string(captcha_image))
def write_json_to_file():
    """OCR the page images 2..59 and dump the parsed pages to ``out.json``.

    For each ``./images/images_pdf1/image (<i>).jpg`` the text is recognised
    with ``tess.file_to_text`` and parsed with ``parse_file``. Every page
    yields a dict with ``page_id`` (1-based, as a string), ``page_name`` (from
    ``getPageName``) and ``page_info`` (the parsed data, or ``[]`` when
    parsing failed). The list of pages is written as one JSON document.

    The output file is opened only after all pages were processed, so a
    failure during OCR neither truncates an existing ``out.json`` nor leaks
    an open file handle.
    """
    file_path = './images/images_pdf1/'
    write_array = []
    for page_id, i in enumerate(range(2, 60), start=1):
        image_file = file_path + "image (" + str(i) + ").jpg"
        text = tess.file_to_text(image_file, lang='eng', psm=tess.PSM.AUTO,
                                 path='tessdata-master/')
        try:
            page_info = parse_file(text)
        except Exception as inst:
            # Best effort: keep the page with empty info, but say so instead
            # of silently swallowing the error.
            print('could not parse page {}: {}'.format(page_id, inst))
            page_info = []
        write_array.append({
            'page_id': str(page_id),
            'page_name': getPageName(text),
            'page_info': page_info,
        })

    with open('out.json', 'w') as f:
        f.write(json.dumps(write_array))
def generate_text_from_file(self, tessDataPath='tessdata-master/'):
    """Run OCR on ``self.imagePath``, store the text on ``self.text`` and return it.

    ``tessDataPath`` is passed to ``tess.file_to_text`` as its ``path`` argument.
    """
    recognised = tess.file_to_text(
        self.imagePath,
        lang='eng',
        psm=tess.PSM.AUTO,
        path=tessDataPath,
    )
    self.text = recognised
    return recognised
def test_image_file(self):
    """SetImageFile + GetUTF8Text must agree with the file_to_text helper."""
    api = self._api
    image_file = self._image_file
    api.SetImageFile(image_file)
    from_api = api.GetUTF8Text()
    self.assertIn("quick", from_api)
    from_helper = tesserocr.file_to_text(image_file)
    self.assertEqual(from_api, from_helper)
def test_image_file(self):
    """After Init, SetImageFile + GetUTF8Text must agree with file_to_text."""
    api = self._api
    image_file = self._image_file
    api.Init()
    api.SetImageFile(image_file)
    from_api = api.GetUTF8Text()
    self.assertIn('quick', from_api)
    from_helper = tesserocr.file_to_text(image_file)
    self.assertEqual(from_api, from_helper)
def ocr_remote_link():
    """Download a sample captcha image, save it under ``pic/`` and print its OCR text.

    The file is named after the MD5 hex digest of the downloaded bytes.
    Nothing is saved or printed when the HTTP status is not OK. The ``pic``
    directory is created on demand so the first run does not fail with
    ``FileNotFoundError``.
    """
    import os  # local import: only needed to create the output directory

    url = "https://raw.githubusercontent.com/Python3WebSpider/CrackImageCode/master/code2.jpg"
    resp = requests.get(url)
    if resp.status_code != codes.ok:
        return

    os.makedirs('pic', exist_ok=True)
    # hexdigest() already returns a str, so no extra conversion is needed.
    file_name = 'pic/' + md5(resp.content).hexdigest() + '.jpg'
    with open(file_name, 'wb') as f:
        f.write(resp.content)
    print(tesserocr.file_to_text(file_name))
def same_old_solve():
    """OCR ``cap1/im0.png`` .. ``cap1/im10000.png`` in two ways and print both results.

    Each picture is recognised once from a grayscale PIL image
    (``image_to_text``) and once directly from the file (``file_to_text``
    with ``PSM.AUTO``). Original author's note: the recognition rate is
    still very low.
    """
    for i in range(10001):
        picture_path = "cap1/im{}.png".format(i)
        # Close every image as soon as it was converted; otherwise the loop
        # keeps thousands of file handles open until garbage collection.
        with Image.open(picture_path) as picture:
            result1 = tesserocr.image_to_text(picture.convert('L'), lang='eng')
        result2 = tesserocr.file_to_text(picture_path, psm=tesserocr.PSM.AUTO)
        print(result1)
        print(result2)
def predict(file_name):
    """Combine the keras and tesseract recognition results for *file_name*.

    Returns the words from ``recognize_keras`` joined by spaces, then a
    space, then the whitespace-normalised tesserocr text. The input file is
    deleted before returning.
    """
    predictions = recognize_keras(file_name)
    tesser_res = tesserocr.file_to_text(file_name)

    # Each prediction is iterated as (word, array) pairs; only the word is kept.
    keras_words = [word for prediction in predictions for word, _array in prediction]

    keras_part = ' '.join(keras_words)
    tesser_part = ' '.join(tesser_res.split())
    response = keras_part + ' ' + tesser_part
    os.remove(file_name)
    return response
def batch_rec(img_path):
    """OCR every entry in the *img_path* folder and save each result as text.

    For an entry ``<name>.<ext>`` the recognised text is written to
    ``<name>_rec.txt`` in the same folder. Entries that cannot be recognised
    are reported with ``skip`` and ignored. Returns None; does nothing when
    *img_path* does not exist.

    Paths are built with ``os.path.join`` so *img_path* works with or without
    a trailing separator. The base name comes from ``os.path.splitext`` so
    file names with no dot or several dots no longer raise ``ValueError``.
    """
    if not os.path.exists(img_path):
        return

    for img in os.listdir(img_path):
        print("predicting " + img)
        try:
            content = tesserocr.file_to_text(os.path.join(img_path, img))
        except Exception:
            # Best effort: one unreadable entry must not abort the batch.
            print("skip", img)
            continue

        name, _ext = os.path.splitext(img)
        with open(os.path.join(img_path, name + '_rec.txt'), 'w') as out:
            out.write(content)
def code_tests_ocr(images, version):
    """Print OCR results for *images* using one of two tesserocr code paths.

    Args:
        images: iterable of image file names.
        version: ``1`` uses a shared ``PyTessBaseAPI`` instance and also
            prints the word confidences; ``2`` uses
            ``tesserocr.file_to_text``. Any other value prints an error line.
    """
    # Compare by value: ``is`` on int literals depends on interpreter caching
    # and raises a SyntaxWarning on modern Python.
    if version == 1:
        print("CODE EXAMPLE 1\n")
        with PyTessBaseAPI() as api:
            for img in images:
                api.SetImageFile(img)
                print('------------------------------------------------------')
                print(img.capitalize())
                print()
                print(api.GetUTF8Text())
                print(api.AllWordConfidences())
                print()
    elif version == 2:
        print("CODE EXAMPLE 2\n")
        for img in images:
            print('------------------------------------------------------')
            print(img.capitalize())
            print()
            print(tesserocr.file_to_text(img))
            print()
    else:
        print('WRONG CODE VERSION ERROR')
# Check that tesserocr was installed successfully.
import tesserocr

ocr_text = tesserocr.file_to_text('../image/OCRTest.png')
print(ocr_text)
def checkCaptcha(img):
    """OCR the sample captcha ``./rawCaptcha/0B7D.png`` and print the result.

    NOTE(review): ``img`` is not used; the file path is hard-coded. Confirm
    whether the argument was meant to be recognised instead.
    """
    captcha_text = tesserocr.file_to_text('./rawCaptcha/0B7D.png')
    print('r=', captcha_text)
import tesserocr as tc
from PIL import Image

# New languages and fonts are supported by adding them to the tessdata
# directory, e.g.
# C:\Users\admin\AppData\Local\Programs\Python\Python37-32\/tessdata/


class OcrTools:
    """Demo that prints tesserocr diagnostics and OCRs two sample images."""

    def __init__(self):
        """Print version, languages and the OCR text of both sample files.

        The images are read straight from disk by ``file_to_text``. The
        previously opened, unused and never closed PIL image was removed.
        """
        print(tc.tesseract_version())  # tesseract-ocr version
        print(tc.get_languages())  # tessdata path and available languages
        filename = 'data/news.png'
        en_filename = 'data/testp.png'
        # Standard Chinese image
        print('---------------------标准中文图片---------------------')
        print(tc.file_to_text(filename, lang='chi_sim'))
        # Standard English image
        print('---------------------标准英文图片---------------------')
        print(tc.file_to_text(en_filename))
import tesserocr
from PIL import Image

# OCR the captcha as-is: once from a PIL image, then straight from files.
image = Image.open('./images/code.jpg')
print(tesserocr.image_to_text(image))
print(tesserocr.file_to_text('./images/code.jpg'))
print(tesserocr.file_to_text('./images/image.png'))

# Convert to grayscale, then binarise with a 256-entry lookup table:
# levels below the threshold map to 0, all others to 1.
gray = image.convert('L')
threshold = 110
table = [0 if level < threshold else 1 for level in range(256)]
image = gray.point(table, '1')
print(tesserocr.image_to_text(image))
import tesserocr

# OCR code1.jpg .. code5.jpg and print the stripped text of each.
for i in range(1, 6):
    # Named file_name rather than ``str`` so the builtin is not clobbered.
    file_name = 'code{}.jpg'.format(i)
    # This time the file_to_text() method is used.
    result = tesserocr.file_to_text(file_name).strip()
    print(result)
# from selenium import webdriver
# driver = webdriver.Edge('msedgedriver.exe')
# driver = webdriver.PhantomJS()
# driver.get('https://www.baidu.com')
# print(driver.current_url)
import tesserocr
from PIL import Image

# Recognise the same picture twice: from a PIL image and from the file path.
image_path = '1.png'
image = Image.open(image_path)
text_from_image = tesserocr.image_to_text(image)
print(text_from_image)
text_from_file = tesserocr.file_to_text(image_path)
print(text_from_file)
import os
import time
import pytesseract
import tesserocr  # was missing although file_to_text below depends on it
from PIL import Image

# OCR every file ending in "jpg" in a user-supplied folder, write the text
# next to it as <name>.txt, then report the total run time.
startTime = time.time()
filePath = input('Enter file path (e.g. \'C:/sampleImages/\'): ')
for entry in os.listdir(filePath):
    if entry.endswith('jpg'):
        print(entry)
        fileName = entry[:entry.rindex('.')]
        imageFile = os.path.join(filePath, entry)
        # first OCR method
        text = tesserocr.file_to_text(imageFile)
        # second OCR method
        # text = pytesseract.image_to_string(Image.open(imageFile))
        # os.path.join works with or without a trailing separator; ``with``
        # makes sure the text is flushed and the handle closed.
        with open(os.path.join(filePath, fileName + '.txt'), 'w') as f:
            f.write(text)
        print('text file created')
    else:
        print(entry, ' - skipped')

elapsedTime = time.time() - startTime
m, s = divmod(elapsedTime, 60)
h, m = divmod(m, 60)
print('Total script run time: ', '%d:%02d:%02d' % (h, m, s))
import tesserocr
from PIL import Image

# Recognise the captcha straight from the file and print the text.
print(tesserocr.file_to_text('Code.jpg'))
def ocr_redirect():
    """Print the OCR text of ``pic/code.jpg`` (by path) and ``pic/code2.jpg`` (as image).

    Original author's note: the recognition probability is not high and is
    easily disturbed by lines and blurring.
    """
    text_from_file = tesserocr.file_to_text("pic/code.jpg")
    print(text_from_file)

    image = Image.open("pic/code2.jpg")
    text_from_image = tesserocr.image_to_text(image)
    print(text_from_image)
def get_text(filepath):
    """Return the text tesserocr recognises in *filepath* with ``lang='sqi'``."""
    recognised = tesserocr.file_to_text(filepath, lang='sqi')
    return recognised
import tesserocr
from PIL import Image
import pytesseract

if __name__ == '__main__':
    # Compare tesserocr (reading the file) with pytesseract (reading the image).
    image_path = 'C:/Users/lijiafei/Desktop/verifycode/verfycode.jpg'
    image = Image.open(image_path)
    # Alternative: print(tesserocr.image_to_text(image)) reads from the image.
    tesserocr_text = tesserocr.file_to_text(image_path)
    print(tesserocr_text)
    pytesseract_text = pytesseract.image_to_string(image)
    print(pytesseract_text, 1)
# -*- coding: UTF-8 -*-
import tesserocr
from PIL import Image

# Recognise the same picture from a PIL image and from its file path.
image_path = "/Users/gaxiong/Downloads/image.png"
image = Image.open(image_path)
text_from_image = tesserocr.image_to_text(image)
print(text_from_image)
text_from_file = tesserocr.file_to_text(image_path)
print(text_from_file)
import tesserocr
from PIL import Image

# Recognise test.jpeg twice: from a PIL image, then from the file path.
image_path = 'test.jpeg'
image = Image.open(image_path)
print(tesserocr.image_to_text(image))
print(tesserocr.file_to_text(image_path))
import tesserocr
from PIL import Image

# Show the tesserocr version and language information.
version_info = tesserocr.tesseract_version()
print(version_info)
language_info = tesserocr.get_languages()
print(language_info)

# OCR the captcha from a PIL image and then from the file path.
image_path = 'CheckCode.jpg'
image = Image.open(image_path)
print(tesserocr.image_to_text(image))
print(tesserocr.file_to_text(image_path))
cv2.imwrite(filename, img) return img, filename ''' ''' plt.figure('licencePlate') #图名 plt.imshow(image_gray,cmap='gray') #cmap即colormap,颜色映射 plt.axis('on') #关闭网格线 plt.show() ''' #image_gray.save('jin2.jpg') #box=(70,65,590,190) #region = image_black_white.crop(box) #region.show() #region.save('cutoff.jpg') #print(tesserocr.image_to_text('cutoff.jpg')) print(tesserocr.file_to_text('bchuan.test.exp0.jpg')) class languages: CHS = 'chi_sim' CHT = 'chi_tra' ENG = 'eng' NA = 'num' def img_to_str(image_path, lang): return pytesseract.image_to_string(Image.open(image_path), lang) def save(text): fs = open("../OCR/gg_ocr.txt", 'w+', encoding='utf-8') # 遍历后的图片提取文字,保存到txt
print api.AllWordConfidences() # api is automatically finalized when used in a with-statement (context manager). # otherwise api.End() should be explicitly called when it's no longer needed. #Basic Usage import tesserocr from PIL import Image print tesserocr.tesseract_version() # print tesseract-ocr version print tesserocr.get_languages( ) # prints tessdata path and list of available languages image = Image.open('sample.jpg') print tesserocr.image_to_text(image) # print ocr text from image # or print tesserocr.file_to_text('sample.jpg') #Advanced API from PIL import Image from tesserocr import PyTessBaseAPI image = Image.open('/usr/src/tesseract/testing/phototest.tif') with PyTessBaseAPI() as api: api.SetImage(image) boxes = api.GetComponentImages(RIL.TEXTLINE, True) print 'Found {} textline image components.'.format(len(boxes)) for i, (im, box, _, _) in enumerate(boxes): # im is a PIL image object # box is a dict with x, y, w and h keys api.SetRectangle(box['x'], box['y'], box['w'], box['h']) ocrResult = api.GetUTF8Text()
# File : test_for_packages.py
# IDE : PyCharm
import tesserocr
from PIL import Image
from selenium import webdriver
from bs4 import BeautifulSoup

# Part 1: test selenium
# prepare the options for the chrome driver
options = webdriver.ChromeOptions()
options.add_argument('headless')
# start chrome browser; ``options=`` replaces the deprecated ``chrome_options=``
# keyword, which newer Selenium releases no longer accept.
browser = webdriver.Chrome(options=options)
try:
    browser.get('https://www.baidu.com')
    print(browser.current_url)
finally:
    # Always shut the browser down, even when the page load fails.
    browser.quit()

# Part 2: test beautifulsoup4
soup = BeautifulSoup('<p>Hello<p>', 'lxml')
print(soup.p.string)

# Part 3: test tesserocr
# method one: use image_to_text
image = Image.open('chapter_1/python3webspider.png')
print(tesserocr.image_to_text(image))
# method two: use file_to_text
print(tesserocr.file_to_text('chapter_1/python3webspider.png'))
from selenium import webdriver
import tesserocr
from PIL import Image

if __name__ == "__main__":
    #api = tesserocr.PyTessBaseAPI(path=r'C:\Program Files (x86)\Tesseract-OCR\tessdata')
    #browser = webdriver.Chrome()
    #browser = webdriver.Firefox()
    #browser = webdriver.PhantomJS(executable_path=r'E:\phantomjs\bin\phantomjs.exe')
    #browser.get('https:www.baidu.com')
    #print(browser.current_url)

    # Recognise image.png from a PIL image and from its file path.
    image_path = 'image.png'
    image = Image.open(image_path)
    text_from_image = tesserocr.image_to_text(image)
    print(text_from_image)
    text_from_file = tesserocr.file_to_text(image_path)
    print(text_from_file)
import tesserocr
from PIL import Image

# file_to_text() expects a file name; an already opened PIL image must be
# recognised with image_to_text() instead.
image = Image.open('image.png')
print(tesserocr.image_to_text(image))
#coding:utf-8
'''
Image captcha
'''
import tesserocr
from PIL import Image

# Open the first captcha and convert the picture to text with image_to_text().
first_path = 'D:\\exercise\\code.jpg'
image = Image.open(first_path)
result = tesserocr.image_to_text(image)
print(result)
# file_to_text also works, but the original author found it unstable.
print(tesserocr.file_to_text(first_path))

image2 = Image.open('D:\\exercise\\code2.jpg')
result = tesserocr.image_to_text(image2)
print(result)

# convert('L') turns the picture to grayscale, convert('1') binarises it.
image2 = image2.convert('L').convert('1')
image2.show()

image = image.convert('L')
threshold = 127  # binarisation threshold
# 256-entry table: 0 for levels below the threshold, 1 for all others.
table = [0 if level < threshold else 1 for level in range(256)]
print('table>>>>>', table)
import tesserocr
from PIL import Image

# Recognise the captcha from a PIL image.
image_path = 'code.jpg'
image = Image.open(image_path)
print(tesserocr.image_to_text(image))

# Another simple tesserocr method: convert the image file directly to a string.
text_from_file = tesserocr.file_to_text(image_path)
print(text_from_file)
import tesserocr
from PIL import Image

# Recognise a template with the chi_sim language data, then a file by path.
template = Image.open('images/template.png')
print(tesserocr.image_to_text(template, lang='chi_sim'))
print(tesserocr.file_to_text('images/test.jpg'))

# Grayscale and binarisation processing.
gray = Image.open('images/CheckCode.jpg').convert('L')
threshold = 127  # threshold
# 256-entry table: 0 for levels below the threshold, 1 for all others.
table = [0 if level < threshold else 1 for level in range(256)]
image = gray.point(table, '1')
image.show()
print(tesserocr.image_to_text(image))
import tesserocr

# Recognise the captcha straight from the file and print the text.
captcha_text = tesserocr.file_to_text('code.jpg')
print(captcha_text)