示例#1
0
def login():
    """Submit the Douban login form, then download and OCR the captcha image.

    Posts the hard-coded form to the login URL, extracts the captcha image
    link from the response with a regular expression, saves the image via
    ``save_file`` and prints the text recognized by ``tesserocr`` and by
    ``pytesseract``.

    Returns:
        None. When the response contains no captcha image tag, a message is
        printed and the function returns early instead of raising
        ``IndexError``.
    """
    url = 'https://www.douban.com/accounts/login'
    # NOTE(review): account and captcha values are hard-coded here; presumably
    # sample data -- confirm no real credentials are kept in source.
    formdata = {
        "source": "None",
        "form_email": "15817161961",
        "redir": "https://www.douban.com",
        "form_password": "******",
        "captcha-solution": "brother",  # captcha text
        "captcha-id": "LaWfXy8orhmAw5XhjVtF2hQs:en",  # captcha image id
        "remember": "on"
    }
    response = post_http(url, formdata)
    res_tr = r'<img id="captcha_image" src="(.*?)" alt="captcha" class="captcha_image"/>'
    matches = re.findall(res_tr, str(response), re.S | re.M)
    if not matches:
        # No captcha tag in the page: nothing to download or recognize.
        print('captcha image not found in login response')
        return
    img_link = matches[0]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(img_link)

    save_file("../../../../img", "captcha.jpg", get_file(img_link))
    # NOTE(review): the image is saved through a relative directory but read
    # back through this absolute path -- verify both name the same file.
    captcha_path = "/home/howell/PycharmProjects/luckwine-spider/img/captcha.jpg"
    print(tesserocr.file_to_text(captcha_path))
    captcha_image = Image.open(captcha_path)
    print(captcha_image)
    print(pytesseract.image_to_string(captcha_image))
示例#2
0
def write_json_to_file():
    """OCR the numbered page images and dump the per-page results to ``out.json``.

    Images ``image (2).jpg`` through ``image (59).jpg`` under
    ``./images/images_pdf1/`` are recognized with ``tess.file_to_text``. For
    each page a dict with ``page_id`` (1-based, as a string), ``page_name``
    (from ``getPageName``) and ``page_info`` (from ``parse_file``) is
    collected, and the whole list is written as JSON.

    The output file is opened only after all pages are processed, so a failure
    during OCR neither truncates an existing ``out.json`` nor leaks an open
    file handle.
    """
    file_path = './images/images_pdf1/'
    write_array = []
    # Page ids start at 1 while the image numbering starts at 2.
    for page_id, i in enumerate(range(2, 60), start=1):
        image_file = file_path + "image (" + str(i) + ").jpg"
        text = tess.file_to_text(image_file,
                                 lang='eng',
                                 psm=tess.PSM.AUTO,
                                 path='tessdata-master/')
        try:
            page_info = parse_file(text)
        except Exception:
            # Best effort: a page that cannot be parsed is still recorded,
            # with empty page_info.
            page_info = []
        write_array.append({
            'page_id': str(page_id),
            'page_name': getPageName(text),
            'page_info': page_info,
        })

    with open('out.json', 'w') as f:
        json.dump(write_array, f)
示例#3
0
 def generate_text_from_file(self, tessDataPath='tessdata-master/'):
     """OCR the file at ``self.imagePath`` and store the result on ``self.text``.

     ``tessDataPath`` is forwarded to ``tess.file_to_text`` as its ``path``
     argument; the language is fixed to ``'eng'`` and the page segmentation
     mode to ``tess.PSM.AUTO``. The recognized text is stored and returned.
     """
     ocr_options = {'lang': 'eng', 'psm': tess.PSM.AUTO, 'path': tessDataPath}
     recognized = tess.file_to_text(self.imagePath, **ocr_options)
     self.text = recognized
     return recognized
示例#4
0
 def test_image_file(self):
     """SetImageFile + GetUTF8Text must agree with tesserocr.file_to_text."""
     self._api.SetImageFile(self._image_file)
     api_text = self._api.GetUTF8Text()
     self.assertIn("quick", api_text)
     # The module-level helper should produce exactly the same text.
     helper_text = tesserocr.file_to_text(self._image_file)
     self.assertEqual(api_text, helper_text)
示例#5
0
 def test_image_file(self):
     """After Init, SetImageFile + GetUTF8Text must agree with file_to_text."""
     self._api.Init()
     self._api.SetImageFile(self._image_file)
     api_text = self._api.GetUTF8Text()
     self.assertIn('quick', api_text)
     # The module-level helper should produce exactly the same text.
     helper_text = tesserocr.file_to_text(self._image_file)
     self.assertEqual(api_text, helper_text)
示例#6
0
def ocr_remote_link():
    """Download a sample captcha image, save it under ``pic/`` and print its OCR text.

    The file is named after the MD5 hex digest of the downloaded bytes. When
    the HTTP status is not OK the function returns without doing anything.
    """
    import os  # local import: only this function needs it

    url = "https://raw.githubusercontent.com/Python3WebSpider/CrackImageCode/master/code2.jpg"
    # Without a timeout, requests can block indefinitely on a stalled server.
    resp = requests.get(url, timeout=10)
    if resp.status_code != codes.ok:
        return

    # Create the target directory first; open() would fail if it is missing.
    os.makedirs('pic', exist_ok=True)
    file_name = 'pic/' + md5(resp.content).hexdigest() + '.jpg'
    with open(file_name, 'wb') as f:
        f.write(resp.content)
    print(tesserocr.file_to_text(file_name))
示例#7
0
def same_old_solve():
    """
    OCR the images ``cap1/im0.png`` .. ``cap1/im10000.png`` in two ways and print both results.

    Each image is recognized once from a grayscale PIL image with
    ``tesserocr.image_to_text`` and once straight from its path with
    ``tesserocr.file_to_text``.

    Original author's note: the recognition rate is still very low.
    """
    for i in range(10001):
        picture_path = "cap1/im{}.png".format(i)
        # Close every image after use so the loop does not accumulate
        # thousands of open file handles.
        with Image.open(picture_path) as picture:
            result1 = tesserocr.image_to_text(picture.convert('L'), lang='eng')
        result2 = tesserocr.file_to_text(picture_path, psm=tesserocr.PSM.AUTO)
        print(result1)
        print(result2)
示例#8
0
def predict(file_name):
    """Combine the Keras and tesserocr readings of ``file_name`` into one string.

    The words from ``recognize_keras`` come first, followed by the
    whitespace-normalized tesserocr text, all joined by single spaces.
    The input file is deleted before the combined string is returned.
    """
    keras_predictions = recognize_keras(file_name)
    tesseract_text = tesserocr.file_to_text(file_name)

    # Each prediction is iterated as (word, array) pairs; only the word is kept.
    keras_words = [
        word
        for prediction in keras_predictions
        for word, _ in prediction
    ]

    tesseract_words = tesseract_text.split()
    combined = ' '.join(keras_words) + ' ' + ' '.join(tesseract_words)

    os.remove(file_name)
    return combined
示例#9
0
def batch_rec(img_path):
    """OCR every entry of a directory and write each result to ``<name>_rec.txt``.

    Args:
        img_path: Directory holding the images. A trailing separator is
            optional because paths are built with ``os.path.join``.

    Returns:
        None. A missing ``img_path`` is ignored silently. Entries that fail
        OCR are reported with ``skip`` and do not stop the batch.
    """
    if not os.path.exists(img_path):
        return

    for img in os.listdir(img_path):
        print("predicting " + img)
        try:
            content = tesserocr.file_to_text(os.path.join(img_path, img))
        except Exception:
            # Best effort: anything tesserocr cannot read is skipped.
            print("skip", img)
            continue

        # splitext copes with names that have no dot or several dots, where
        # unpacking ``img.split(".")`` into two names raised ValueError.
        name, _ = os.path.splitext(img)
        out_path = os.path.join(img_path, name + '_rec.txt')
        with open(out_path, 'w') as out_file:
            out_file.write(content)
示例#10
0
def code_tests_ocr(images, version):
    """Print OCR output for each image using one of two tesserocr code paths.

    Args:
        images: Iterable of image file paths; each is passed to the OCR call
            and printed via ``capitalize()``.
        version: ``1`` reuses a single ``PyTessBaseAPI`` instance and also
            prints the word confidences; ``2`` calls
            ``tesserocr.file_to_text`` per image. Any other value prints an
            error message.
    """
    separator = '------------------------------------------------------'
    # Compare by value: ``is`` tests object identity, which only appears to
    # work for small ints because of a CPython caching detail.
    if version == 1:
        print("CODE EXAMPLE 1\n")
        with PyTessBaseAPI() as api:
            for img in images:
                api.SetImageFile(img)
                print(separator)
                print(img.capitalize())
                print()
                print(api.GetUTF8Text())
                print(api.AllWordConfidences())
                print()
    elif version == 2:
        print("CODE EXAMPLE 2\n")
        for img in images:
            print(separator)
            print(img.capitalize())
            print()
            print(tesserocr.file_to_text(img))
            print()
    else:
        print('WRONG CODE VERSION ERROR')
示例#11
0
# Smoke test: check that tesserocr is installed by OCR-ing a sample image.
import tesserocr

ocr_text = tesserocr.file_to_text('../image/OCRTest.png')
print(ocr_text)
示例#12
0
def checkCaptcha(img):
    """OCR the sample captcha file and print the recognized text.

    NOTE(review): ``img`` is not used; the captcha path below is hard-coded.
    Confirm whether the argument was meant to select the file.
    """
    captcha_text = tesserocr.file_to_text('./rawCaptcha/0B7D.png')
    print('r=', captcha_text)
import tesserocr as tc
from PIL import Image

'''
通过添加字体库支持新的语言和字体
C:\\Users\\admin\\AppData\\Local\\Programs\\Python\\Python37-32\\/tessdata/'
'''

class OcrTools:
    """Placeholder class; it currently defines no attributes or behavior."""

    def __init__(self):
        # Nothing to initialize yet.
        pass

# Report the tesseract-ocr version in use.
print(tc.tesseract_version())
# Report the tessdata path and the list of available languages.
print(tc.get_languages())

filename = 'data/news.png'
en_filename = 'data/testp.png'

image = Image.open(filename)
# Alternative: tc.image_to_text(image) would OCR the opened image instead of
# reading the file by path as done below.

# Standard Chinese image
print('---------------------标准中文图片---------------------')
chinese_text = tc.file_to_text(filename, lang='chi_sim')
print(chinese_text)

# Standard English image
print('---------------------标准英文图片---------------------')
english_text = tc.file_to_text(en_filename)
print(english_text)
示例#14
0
import tesserocr
from PIL import Image

# OCR the captcha from an opened PIL image.
image = Image.open('./images/code.jpg')
r = tesserocr.image_to_text(image)
print(r)

# OCR two files directly by path.
print(tesserocr.file_to_text('./images/code.jpg'))
print(tesserocr.file_to_text('./images/image.png'))

# Convert to grayscale, then binarize with a fixed threshold before OCR-ing again.
img = image.convert('L')
threshold = 110
# Lookup table for point(): values below the threshold map to 0, the rest to 1.
table = []
for i in range(256):
    table.append(0 if i < threshold else 1)
image = img.point(table, '1')
res = tesserocr.image_to_text(image)
print(res)


示例#15
0
import tesserocr

# OCR code1.jpg .. code5.jpg with file_to_text() and print each stripped result.
for i in range(1, 6):
    # Named file_name rather than ``str`` so the builtin is not shadowed.
    file_name = 'code{}.jpg'.format(i)
    result = tesserocr.file_to_text(file_name).strip()
    print(result)
示例#16
0
# from selenium import webdriver
# driver = webdriver.Edge('msedgedriver.exe')
# driver = webdriver.PhantomJS()
# driver.get('https://www.baidu.com')
# print(driver.current_url)

import tesserocr
from PIL import Image
# Method one: OCR an opened PIL image.
image = Image.open('1.png')
image_text = tesserocr.image_to_text(image)
print(image_text)

# Method two: OCR the same file directly by path.
file_text = tesserocr.file_to_text('1.png')
print(file_text)
import os
import time
import pytesseract
from PIL import Image

# OCR every file ending in 'jpg' in a user-supplied folder, writing the text
# of each into a .txt file next to it, then report the elapsed time.
startTime = time.time()
filePath = input('Enter file path (e.g. \'C:/sampleImages/\'): ')
for file_name in os.listdir(filePath):
    if not file_name.endswith('jpg'):
        print(file_name, ' - skipped')
        continue

    print(file_name)
    imageFile = os.path.join(filePath, file_name)

    # First OCR method; the alternative is
    # pytesseract.image_to_string(Image.open(imageFile)).
    text = tesserocr.file_to_text(imageFile)

    # Build the output path with os.path.join, like the input path, so a
    # missing trailing separator in filePath no longer breaks the file name.
    base_name = os.path.splitext(file_name)[0]
    # The with-block closes (and flushes) each output file.
    with open(os.path.join(filePath, base_name + '.txt'), 'w') as f:
        f.write(text)
    print('text file created')

elapsedTime = time.time() - startTime
m, s = divmod(elapsedTime, 60)
h, m = divmod(m, 60)
print('Total script run time: ','%d:%02d:%02d' % (h, m, s))
示例#18
0
import tesserocr
from PIL import Image

# OCR the captcha file directly from disk and show the recognized text.
CAPTCHA_FILE = 'Code.jpg'
result = tesserocr.file_to_text(CAPTCHA_FILE)
print(result)
示例#19
0
def ocr_redirect():
    """Print the OCR text of two sample captchas, one by path and one from a PIL image.

    Original author's note: the recognition rate is not high and is easily
    disturbed by lines and blurring.
    """
    print(tesserocr.file_to_text("pic/code.jpg"))

    # The with-block closes the image file once OCR is done.
    with Image.open("pic/code2.jpg") as image:
        print(tesserocr.image_to_text(image))
示例#20
0
def get_text(filepath):
    """Return the text tesserocr recognizes in ``filepath`` using the ``'sqi'`` language data."""
    recognized = tesserocr.file_to_text(filepath, lang='sqi')
    return recognized
示例#21
0
文件: utils.py 项目: BoyOoka/api_test
import tesserocr
from PIL import Image
import pytesseract

if __name__ == '__main__':
    captcha_path = 'C:/Users/lijiafei/Desktop/verifycode/verfycode.jpg'
    image = Image.open(captcha_path)
    # Alternative: tesserocr.image_to_text(image) runs OCR on the opened image.
    file_text = tesserocr.file_to_text(captcha_path)
    print(file_text)
    # The literal 1 is printed after the pytesseract result as a second argument.
    print(pytesseract.image_to_string(image), 1)
示例#22
0
# -*- coding: UTF-8 -*-

import tesserocr

from PIL import Image

# Method one: OCR an opened PIL image.
image = Image.open("/Users/gaxiong/Downloads/image.png")
image_text = tesserocr.image_to_text(image)
print(image_text)

# Method two: OCR the same file directly by path.
file_text = tesserocr.file_to_text("/Users/gaxiong/Downloads/image.png")
print(file_text)
示例#23
0
import tesserocr
from PIL import Image

# Method one: OCR an opened PIL image.
image = Image.open('test.jpeg')
result = tesserocr.image_to_text(image)
print(result)

# Method two: OCR the same file directly by path.
file_result = tesserocr.file_to_text('test.jpeg')
print(file_result)
示例#24
0
import tesserocr

from PIL import Image

# Print the version string reported by tesserocr.tesseract_version().
version_info = tesserocr.tesseract_version()
print(version_info)

# Print what tesserocr.get_languages() reports.
available_languages = tesserocr.get_languages()
print(available_languages)

image = Image.open('CheckCode.jpg')

# Print the OCR text obtained from the opened image ...
image_text = tesserocr.image_to_text(image)
print(image_text)

# ... and the OCR text obtained by reading the file by path.
file_text = tesserocr.file_to_text('CheckCode.jpg')
print(file_text)
示例#25
0
    cv2.imwrite(filename, img)
    return img, filename
'''
'''
plt.figure('licencePlate')  #图名
plt.imshow(image_gray,cmap='gray') #cmap即colormap,颜色映射
plt.axis('on')  #关闭网格线
plt.show()
'''
#image_gray.save('jin2.jpg')
#box=(70,65,590,190)
#region = image_black_white.crop(box)
#region.show()
#region.save('cutoff.jpg')
#print(tesserocr.image_to_text('cutoff.jpg'))
# OCR the image file by path and print the recognized text.
ocr_text = tesserocr.file_to_text('bchuan.test.exp0.jpg')
print(ocr_text)


class languages:
    """Named constants for language codes.

    NOTE(review): presumably passed as the ``lang`` argument of the OCR
    calls -- confirm against callers.
    """

    # presumably Simplified Chinese -- TODO confirm
    CHS = "chi_sim"
    # presumably Traditional Chinese -- TODO confirm
    CHT = "chi_tra"
    # presumably English -- TODO confirm
    ENG = "eng"
    # presumably a digits-only model -- TODO confirm
    NA = "num"


def img_to_str(image_path, lang):
    """Return the text pytesseract recognizes in the image at ``image_path``.

    Args:
        image_path: Path of the image file to open.
        lang: Language code passed to ``pytesseract.image_to_string``.
    """
    # The with-block closes the image file once OCR is done.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image, lang=lang)


def save(text):
    fs = open("../OCR/gg_ocr.txt", 'w+', encoding='utf-8')  # 遍历后的图片提取文字,保存到txt
        print api.AllWordConfidences()
# api is automatically finalized when used in a with-statement (context manager).
# otherwise api.End() should be explicitly called when it's no longer needed.

#Basic Usage
import tesserocr
from PIL import Image

# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(tesserocr.tesseract_version())  # print tesseract-ocr version
print(tesserocr.get_languages())  # prints tessdata path and list of available languages

image = Image.open('sample.jpg')
print(tesserocr.image_to_text(image))  # print ocr text from image
# or
print(tesserocr.file_to_text('sample.jpg'))

#Advanced API
from PIL import Image
# RIL is needed for RIL.TEXTLINE below; without this import the script
# fails with NameError.
from tesserocr import PyTessBaseAPI, RIL

image = Image.open('/usr/src/tesseract/testing/phototest.tif')
with PyTessBaseAPI() as api:
    api.SetImage(image)
    boxes = api.GetComponentImages(RIL.TEXTLINE, True)
    print('Found {} textline image components.'.format(len(boxes)))
    for i, (im, box, _, _) in enumerate(boxes):
        # im is a PIL image object
        # box is a dict with x, y, w and h keys
        api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
        # Recognized text of the current rectangle.
        # NOTE(review): the value is not used after this line in the visible
        # code -- the snippet may be truncated.
        ocrResult = api.GetUTF8Text()
# File  : test_for_packages.py
# IDE   : PyCharm

import tesserocr
from PIL import Image
from selenium import webdriver
from bs4 import BeautifulSoup

# Part 1: test selenium
# prepare the option for the chrome driver (headless mode)
options = webdriver.ChromeOptions()
options.add_argument('headless')

# start chrome browser; `options=` replaces the deprecated `chrome_options=`
# keyword, which newer Selenium releases no longer accept
browser = webdriver.Chrome(options=options)
try:
    browser.get('https://www.baidu.com')
    print(browser.current_url)
finally:
    # Always shut the browser down, even when the page load fails.
    browser.quit()

# Part 2: test beautifulsoup4
soup = BeautifulSoup('<p>Hello<p>', 'lxml')
print(soup.p.string)

# Part 3: test tesserocr
# method one: use image_to_text
image = Image.open('chapter_1/python3webspider.png')
print(tesserocr.image_to_text(image))

# method two: use file_to_text
print(tesserocr.file_to_text('chapter_1/python3webspider.png'))
示例#28
0
from selenium import webdriver
import tesserocr
from PIL import Image

if __name__ == "__main__":
    # Disabled experiments (custom tessdata path, selenium browsers) are
    # omitted; only the two OCR calls run.
    image_path = 'image.png'
    image = Image.open(image_path)
    image_text = tesserocr.image_to_text(image)
    print(image_text)
    file_text = tesserocr.file_to_text(image_path)
    print(file_text)
示例#29
0
import tesserocr
from PIL import Image
image = Image.open('image.png')
# file_to_text() expects a file name; an opened PIL image has to go through
# image_to_text() instead.
print(tesserocr.image_to_text(image))
示例#30
0
#coding:utf-8
'''
图片验证码
'''

import tesserocr
from PIL import Image

# Open the image first, then convert it to text with image_to_text().
image = Image.open('D:\\exercise\\code.jpg')
result = tesserocr.image_to_text(image)
print(result)
# file_to_text() also works; the original author found it less stable.
print(tesserocr.file_to_text('D:\\exercise\\code.jpg'))

image2 = Image.open('D:\\exercise\\code2.jpg')
result = tesserocr.image_to_text(image2)
print(result)

# convert('L') produces a grayscale image, convert('1') a two-level one.
image2 = image2.convert('L')
image2 = image2.convert('1')
image2.show()

image = image.convert('L')
threshold = 127  # binarization threshold
# Lookup table with one entry per value 0..255: values below the threshold
# map to 0, all others to 1.
table = []
for i in range(256):
    table.append(0 if i < threshold else 1)
print('table>>>>>', table)
示例#31
0
import tesserocr
from PIL import Image

# Method one: OCR an opened PIL image.
image = Image.open('code.jpg')
result = tesserocr.image_to_text(image)
print(result)

# Method two: tesserocr can also turn the image file straight into a string.
file_result = tesserocr.file_to_text('code.jpg')
print(file_result)
示例#32
0
import tesserocr
from PIL import Image
# OCR one image with the Simplified Chinese data, and a second file by path.
image = Image.open('images/template.png')
template_text = tesserocr.image_to_text(image, lang='chi_sim')
print(template_text)
print(tesserocr.file_to_text('images/test.jpg'))

# Grayscale conversion and binarization
image = Image.open('images/CheckCode.jpg')
image = image.convert('L')
threshold = 127  # threshold
# Lookup table for point(): values below the threshold map to 0, the rest to 1.
table = []
for i in range(256):
    table.append(0 if i < threshold else 1)
image = image.point(table, '1')
image.show()
binarized_text = tesserocr.image_to_text(image)
print(binarized_text)
示例#33
0
import tesserocr

# OCR the image file by path and print the recognized text.
recognized = tesserocr.file_to_text('code.jpg')
print(recognized)