def getCheckCodeString(self, filename):
    """Run OCR on a captcha image, delete the image, and return the text.

    The raw OCR result is printed (prefixed with the Chinese label for
    "captcha") before it is stripped.

    Args:
        filename: Path of the captcha image file on disk.

    Returns:
        The recognised text with leading/trailing whitespace removed.
    """
    try:
        text = str(ocr.image_file_to_string(filename))
        print("验证码: " + text)
    finally:
        # The captcha image is single-use: remove it even when OCR raises,
        # so failed attempts do not leave stale files behind.  The existence
        # check keeps a missing file from masking the original OCR error.
        if os.path.exists(filename):
            os.remove(filename)
    # Strip the whitespace OCR leaves around the recognised characters.
    return text.strip()
def executar(self, diretorio):
    """Run OCR on the image at ``diretorio`` and store the recognised text.

    Args:
        diretorio: Path of the image file to read.

    Returns:
        True when OCR succeeded and the text was stored on the instance,
        False when OCR raised an error.
    """
    try:
        self.__texto = image_file_to_string(diretorio)
    except Exception as erro:
        # Keep the best-effort contract (failure is reported through the
        # return value), but show what went wrong and let
        # KeyboardInterrupt/SystemExit propagate instead of swallowing them.
        print(erro)
        return False
    return True
def image_file_to_string(file):
    """OCR the image at ``file`` with pytesser, run from a fixed directory.

    The working directory is switched to a hard-coded location for the
    duration of the call and always restored afterwards.

    Args:
        file: Path of the image to recognise.
            NOTE(review): a relative path is resolved against the temporary
            working directory, not the caller's - confirm callers pass
            absolute paths.

    Returns:
        Whatever ``pytesser.image_file_to_string`` returns for the image.
    """
    original_dir = os.getcwd()
    try:
        # Raw string: in a normal literal "\U" starts a unicode escape and is
        # a SyntaxError on Python 3; the raw form yields the same path on
        # every interpreter.
        # NOTE(review): presumably pytesser locates its tesseract binary
        # relative to the working directory - confirm.
        os.chdir(r"C:\Users\MrLevo\Anaconda2\Lib")
        return pytesser.image_file_to_string(file)
    finally:
        os.chdir(original_dir)
def getAuthCode(fn):
    """Binarise a captcha image and return its OCR text.

    The image is enlarged 7x, every pixel darker than RGB (90, 136, 100) on
    all three channels becomes black and everything else white, the result
    is saved as ``bw.bmp`` in the working directory, and that file is passed
    to pytesser.

    Args:
        fn: Path of the captcha image.

    Returns:
        The recognised text, upper-cased, with all whitespace removed.
    """
    # Palette, greyscale and RGBA images do not yield (r, g, b) triples, so
    # normalise to RGB before touching individual pixels.
    im = Image.open(fn).convert("RGB")

    # Enlarge the image to give the OCR engine more pixels per glyph.
    nx, ny = im.size
    width = int(nx * 7)
    height = int(ny * 7)
    im2 = im.resize((width, height), Image.BICUBIC)

    # Threshold: a pixel counts as ink only if all channels are below the cut.
    pix = im2.load()
    for x in range(width):
        for y in range(height):
            r, g, b = pix[x, y]
            if r < 90 and g < 136 and b < 100:
                pix[x, y] = (0, 0, 0)
            else:
                pix[x, y] = (255, 255, 255)

    im2.save("bw.bmp")

    # OCR, then drop every whitespace character (inner and outer).
    text = pytesser.image_file_to_string('bw.bmp')
    return re.sub(r'\s+', '', text).upper()
def read_imagetext(self):
    """OCR a fixed sample image with pytesser and print the recognised text.

    Nothing is returned; the result is only written to standard output.
    """
    # Raw string so the backslashes of the Windows path can never be read as
    # escape sequences.
    image_path = r'E:\study\oooo.png'
    # Only the file-based API is exercised here, so no PIL image handle is
    # opened (and left unclosed) beforehand.
    text = pytesser.image_file_to_string(image_path, graceful_errors=True)
    print("Using image_file_to_string():")
    print(text)
def getRand(self):
    """OCR ``ttt0.jpg`` and return the text if it contains four digits.

    The stripped OCR text is always stored in ``self.rand``, even when it
    fails validation.

    Returns:
        The stripped OCR text when it contains a run of four digits,
        otherwise None. None is also returned (after printing a failure
        message) when reading the image raises IOError.
    """
    try:
        fname = 'ttt0.jpg'
        self.rand = pytesser.image_file_to_string(fname).strip()
    except IOError:
        print('>>getRand failed')
        return None
    # A search (not a full match): any four consecutive digits are enough.
    if re.search('[0-9]{4}', self.rand):
        return self.rand
    return None
def textdet(shot,q,mode):
    """Poll frames from ``shot``, OCR each one and forward recognised words.

    Each frame is shown in a window, written to ``image.jpeg`` and passed to
    pytesser.  Newlines and every non-ASCII-letter character are removed from
    the OCR text; the remainder is forwarded only if ``checkvalid`` accepts
    it, the en_US dictionary knows it, and it differs (case-insensitively)
    from the previously forwarded word.

    Args:
        shot: Frame source exposing ``empty()``, ``get()`` and a ``queue``
            attribute with ``clear()`` (presumably a ``Queue.Queue`` of
            image arrays - TODO confirm).
        q: Output queue; receives each accepted word when ``mode`` is 1.
        mode: 1 puts the word on ``q``; 2 prints it and hands it to
            ``translater`` wrapped in a list.

    The loop ends when ESC (key code 27) is pressed in the OpenCV window.

    NOTE(review): the block nesting below is the most plausible reading of
    the source; confirm where ``prev`` is updated and where the key poll
    sits relative to the ``shot.empty()`` check.
    """
    sent=[]
    d=enchant.Dict("en_US")
    prev=""
    while True:
        if shot.empty()==False:
            imgarray=shot.get()
            # Drop any backlog so only the most recent frame is processed.
            shot.queue.clear()
            cv2.imshow('retrieved',imgarray)
            cv2.imwrite("image.jpeg" , imgarray)
            text = pytesser.image_file_to_string("image.jpeg")
            text=text.replace("\n","")
            # (disabled) text=text.replace(" ","")
            # Strip every character that is not an ASCII letter.
            for x in text:
                if not re.match("[a-zA-Z]",x):
                    text=text.replace(x,"")
            valid=checkvalid(text)
            if text!="" and valid==True:
                if d.check(text)==True:
                    # Skip a word identical to the one forwarded last time.
                    if text.lower()!=prev:
                        print (text),
                        if mode==1:
                            q.put(text)
                        elif mode==2:
                            sent.append(text)
                            # (disabled) if text[0].isupper():
                            print sent
                            translater(sent)
                            # Empty the list in place for the next word.
                            del sent[:]
                        prev=text.lower()
                # (disabled) else-branch: run the text through
                # SpellingReplacer and, if the corrected word passes
                # d.check, forward it exactly like the branch above.
        # Poll the keyboard; the mask keeps only the low byte of the key code.
        k = cv2.waitKey(5) & 0xFF
        if k == 27:
            break
    cv2.destroyAllWindows()
def tocodes(fname):
    """Return the stripped OCR text of image ``fname`` and log it.

    Args:
        fname: Path of the image to recognise.

    Returns:
        The OCR result with surrounding whitespace removed.
    """
    ocr_text = pytesser.image_file_to_string(fname)
    codes = ocr_text.strip()
    # NOTE(review): ``i`` is not defined in this function; it has to exist as
    # a module-level name when this runs - confirm it matches ``fname``.
    message = ''.join(['The ttt', str(i), '.jpg\'s code is ', codes])
    print(message)
    return codes
def process_image(url):
    """Log ``url`` and return pytesser's OCR result for it.

    Args:
        url: Passed straight to ``pytesser.image_file_to_string``.
            NOTE(review): despite the name this is handed to a file-based
            API - confirm callers pass a local path.

    Returns:
        The text returned by pytesser.
    """
    # A download + sharpen + pytesseract variant existed here but is disabled.
    label = "URL"
    print("%s %s" % (label, url))
    recognised = pytesser.image_file_to_string(url)
    return recognised
else: buf = list(); elif(len(buf) <= 1 and firsth != 0): buf = list() """ for h in xrange(height): firstw = 0; for w in xrange(width): r,g,b = whitePix[w,h]; tcounter = 0; for pw in xrange(-1,2): for ph in xrange(-1,2): if((w - pw) < 0 or (h - ph) < 0 or (w + pw) >= (width-2) or (h - ph) >= (height-2)): break; tr,tg,tb = whitePix[w - pw,h - ph]; if(tr <= limiar and tg <= limiar and tb <= limiar): #whitePix[w - pw,h - ph] = (0,0,0); tcounter+=1; if(tcounter <= 2): whitePix[w,h] = (255,255,255); imgList[0].save('images/' + fname); imgList[1].save('images/negative_' + fname); imgList[2].save('images/white_' + fname); text = pytesser.image_file_to_string('images/white_' + fname,graceful_errors=True) print(text)
valimg = None valimgs = tree.xpath('//img[@id="yanzheng"]/@src') if len(valimgs) > 0: valimg = valimgs[0] validateCode = None if valimg: fname = 'img/' + str(idx) + '_' + str(config['gid']) + '.jpg' config['gid'] = config['gid'] + 1 ri = s.get("https://passport.csdn.net" + valimg) with open(fname, 'wb') as f: for chk in ri: f.write(chk) f.close() validateCode = pytesser.image_file_to_string(fname) validateCode = validateCode.strip() validateCode = validateCode.replace(' ', '') validateCode = validateCode.replace('\n', '') result['validateCode'] = validateCode return result def login(usr, pwd, idx): s = requests.Session() r = s.get( 'https://passport.csdn.net/account/login', headers={ 'User-Agent':
import pytesser
import model

# Try Tesseract on two simple images of Chinese characters, then look the
# recognised words up in the CEDICT database.

# Tesseract returns UTF-8 encoded bytes and appears to append two trailing
# newlines, hence the decode + strip.
text = pytesser.image_file_to_string(
    'shangwu.png', lang='chi_sim', graceful_errors=True)
shangwu = text.decode('utf-8').strip()
print(shangwu)

text = pytesser.image_file_to_string(
    'taihaole.png', lang='chi_sim', graceful_errors=True)
print(text)
taihaole = text.decode('utf-8').strip()

# Each word must match exactly one dictionary entry (``one()`` raises
# otherwise).
session = model.connect()
sw_entry = session.query(model.Entry).filter_by(simplified=shangwu).one()
thl_entry = session.query(model.Entry).filter_by(simplified=taihaole).one()

for entry in (sw_entry, thl_entry):
    print(entry.definition)
cwd = os.getcwd() try : os.chdir("C:\Users\MrLevo\Anaconda2\Lib") return pytesser.image_file_to_string(file) finally: os.chdir(cwd) im=Image.open("E:\\image_code.jpg") imgry = im.convert('L')#图像加强,二值化 sharpness =ImageEnhance.Contrast(imgry)#对比度增强 sharp_img = sharpness.enhance(2.0) sharp_img.save("E:\\image_code.jpg") #http://www.cnblogs.com/txw1958/archive/2012/02/21/2361330.html #imgry.show()#这是分布测试时候用的,整个程序使用需要注释掉 #imgry.save("E:\\image_code.jpg") code= pytesser.image_file_to_string("E:\\image_code.jpg")#code即为识别出的图片数字str类型 print code #打印code观察是否识别正确 #---------------------------------------------------------------------- if i <= 2: # 根据自己登录特性,我这里是验证码失败一次,重填所有,失败两次,重填验证码 elem_user.send_keys('S315080092') elem_psw.send_keys('xxxxxxxxxx') elem_code.send_keys(code) click_login = driver.find_element_by_xpath("//img[@src='main_images/images/loginbutton.gif']") click_login.click() #time.sleep(5)#搜索结果页面停留片刻
def run(filename, debug=False):
    """OCR an alarm fax image and split its text into keyword sections.

    The OCR text is searched (fuzzily, via ``fuzzy.bitap``) for the header
    ``ALARMDEPESCHE``.  Everything between that header and the
    ``Einsatzmittelliste`` marker is then scanned for the keywords listed in
    ``ocr/keywords.json``; the text between one matched keyword and the next
    becomes that keyword's content.

    Keyword entries are dicts; the keys read here are ``threshold`` (minimum
    fuzzy score), ``exclude`` (names of competing keywords) and ``ignore``
    (drop the keyword from the result).

    Args:
        filename: Path of the image to recognise.
        debug: When true, print the cropped text and the matched tokens.

    Returns:
        A dict mapping keyword name to its extracted content, or an empty
        list when the ``ALARMDEPESCHE`` header was not found.
    """
    from collections import OrderedDict

    def find_match(text, pattern, minimum=None):
        """Return (score, match, start, end), or four Nones below ``minimum``."""
        minimum = minimum if minimum else 0.7
        score, match, start, end = fuzzy.bitap(text, pattern)
        if not match or score <= minimum:
            return None, None, None, None
        return score, match, start, end

    content = pytesser.image_file_to_string(filename)
    content = unicode(content, 'utf-8')
    _, alarm, _, end = find_match(content, 'ALARMDEPESCHE')

    with open('ocr/keywords.json') as handle:
        keys = json.load(handle)
    # Longest names first, so a short keyword cannot claim text belonging to
    # a longer one.  An OrderedDict is required: a plain dict would throw
    # the sort order away again.
    keys = OrderedDict(
        sorted(keys.items(), key=lambda item: len(item[0]), reverse=True))

    if alarm is None:
        return []

    content = content[end:]
    # Cut everything from the resource list onwards; when the marker is not
    # found ``start`` is None and the slice keeps the whole text.
    _, _, start, _ = find_match(content, 'Einsatzmittelliste')
    content = content[:start]
    if debug:
        print(content)
    original = content

    for name, key in keys.items():
        score, match, start, end = find_match(
            content, name, key.get('threshold'))
        if not match:
            continue

        # Reject the match if a competing keyword fits the same text better.
        skip = False
        for exclude in key.get('exclude', []):
            alt, _, _, _ = find_match(
                match, exclude, keys[exclude].get('threshold'))
            if alt is not None and alt > score:
                skip = True
        if skip:
            continue

        # Accept only keywords sitting within 10 characters of a line start.
        newline = content[:start].rfind('\n')
        if newline == -1 or (start - newline) <= 10:
            key['score'] = score
            key['match'] = match
            key['start'] = start
            key['end'] = end
            # Blank the matched text (newlines kept, so offsets stay valid)
            # so later keywords cannot match it again.
            # NOTE(review): assumed to apply to accepted matches only -
            # confirm against the original layout.
            content = (content[:start]
                       + re.sub(r'[^\n]', ' ', match)
                       + content[end:])

    matched = [item for item in keys.items()
               if item[1].get('match') is not None]
    tokens = sorted(matched, key=lambda item: item[1]['start'])

    # ``peek`` presumably yields (item, following item or None) pairs -
    # TODO confirm.
    for current, following in peek(tokens):
        _, token = current
        if following:
            _, next_token = following
            end = next_token['start']
        else:
            end = len(original)
        section = original[token['end']:end].strip()
        token['content'] = re.sub(r'^[ .:‘]+', '', section).strip()

    tokens = [item for item in tokens if not item[1].get('ignore', False)]
    if debug:
        import pprint
        pprint.PrettyPrinter(indent=4).pprint(tokens)
    return {name: token['content'] for name, token in tokens}
) # i = 0 # while 1: # i = i+1 # try: elem_user = driver.find_element_by_name("username") elem_pwd = driver.find_element_by_name("password") elem_captcha = driver.find_element_by_name("j_captcha_response") driver.get_screenshot_as_file('captcha.jpg') rangle = (1669, 494, 1807, 541) #写成我们需要截取的位置坐标 i = Image.open("/Users/wangmian/PycharmProjects/selenium/captcha.jpg") #打开截图 realcaptcha = i.crop(rangle) #使用Image的crop函数,从截图中再次截取我们需要的区域 realcaptcha.save("/Users/wangmian/PycharmProjects/selenium/realcaptcha.png") code = pytesser.image_file_to_string( '/Users/wangmian/PycharmProjects/selenium/realcaptcha.png') # # # elem_user.send_keys("20141001146") elem_pwd.send_keys("******") time.sleep(0.5) elem_captcha.send_keys(code) elem_pwd.send_keys(Keys.RETURN) time.sleep(5) driver.close() driver.quit()
def run(filename, debug=False):
    """OCR an alarm fax image and split its text into keyword sections.

    The OCR text is searched (fuzzily, via ``fuzzy.bitap``) for the header
    ``ALARMDEPESCHE``.  The text after it is scanned for the keywords listed
    in ``ocr/keywords.json``; the text between one matched keyword and the
    next becomes that keyword's content.

    Keyword entries are dicts; the keys read here are ``threshold`` (minimum
    fuzzy score), ``exclude`` (names of competing keywords), ``previousLine``
    (prepend the text between the previous section and this keyword),
    ``singleLine`` (keep only the first line), ``table`` (keep the leading
    cell of every row after the first) and ``ignore`` (drop the keyword from
    the result).

    Args:
        filename: Path of the image to recognise.
        debug: When true, print the cropped text and the matched tokens.

    Returns:
        A dict mapping keyword name to its extracted content (a string, or a
        list for ``table`` keywords), or an empty list when the
        ``ALARMDEPESCHE`` header was not found.
    """
    from collections import OrderedDict

    def find_match(text, pattern, minimum=None):
        """Return (score, match, start, end), or four Nones below ``minimum``."""
        minimum = minimum if minimum else 0.7
        score, match, start, end = fuzzy.bitap(text, pattern)
        if not match or score <= minimum:
            return None, None, None, None
        return score, match, start, end

    def trim(s):
        """Strip whitespace and leading separator characters from ``s``."""
        return re.sub(r'^[ .:‘]+', '', s.strip()).strip()

    content = pytesser.image_file_to_string(filename)
    content = unicode(content, 'utf-8')
    _, alarm, _, end = find_match(content, 'ALARMDEPESCHE')

    with open('ocr/keywords.json') as handle:
        keys = json.load(handle)
    # Longest names first, so a short keyword cannot claim text belonging to
    # a longer one.  An OrderedDict is required: a plain dict would throw
    # the sort order away again.
    keys = OrderedDict(
        sorted(keys.items(), key=lambda item: len(item[0]), reverse=True))

    if alarm is None:
        return []

    content = content[end:]
    if debug:
        print(content)
    original = content

    for name, key in keys.items():
        score, match, start, end = find_match(
            content, name, key.get('threshold'))
        if not match:
            continue

        # Reject the match if a competing keyword fits the same text better.
        skip = False
        for exclude in key.get('exclude', []):
            alt, _, _, _ = find_match(
                match, exclude, keys[exclude].get('threshold'))
            if alt is not None and alt > score:
                skip = True
        if skip:
            continue

        # Accept only keywords sitting within 10 characters of a line start.
        newline = content[:start].rfind('\n')
        if newline == -1 or (start - newline) <= 10:
            key['score'] = score
            key['match'] = match
            key['start'] = start
            key['end'] = end
            # Blank the matched text (newlines kept, so offsets stay valid)
            # so later keywords cannot match it again.
            # NOTE(review): assumed to apply to accepted matches only -
            # confirm against the original layout.
            content = (content[:start]
                       + re.sub(r'[^\n]', ' ', match)
                       + content[end:])

    matched = [item for item in keys.items()
               if item[1].get('match') is not None]
    tokens = sorted(matched, key=lambda item: item[1]['start'])

    previous = None
    # ``peek`` presumably yields (item, following item or None) pairs -
    # TODO confirm.
    for current, following in peek(tokens):
        _, token = current
        start = token['end']
        if following:
            _, next_token = following
            end = next_token['start']
        else:
            end = len(original)

        s = trim(original[start:end])
        if previous and token.get('previousLine', False):
            # Pull in the text that sits between the end of the previous
            # section's content and this keyword.
            # NOTE(review): assumes the previous content is a string that
            # occurs verbatim in the text - confirm for ``table`` keywords.
            _, previous_token = previous
            previous_content = previous_token['content']
            original_find = (original.find(previous_content)
                             + len(previous_content))
            s = trim(original[original_find:token['start']]) + s

        if token.get('singleLine', False):
            rows = s.splitlines()
            # An empty section has no first line; fall back to ''.
            token['content'] = rows[0] if rows else ''
        elif token.get('table', False):
            rows = [row for row in s.splitlines() if row]
            # The first row is skipped (presumably the table header).
            cells = [group(re.search(r'^(.{10,}?) ?', row))
                     for row in rows[1:]]
            token['content'] = [cell for cell in cells if cell]
        else:
            token['content'] = s
        previous = current

    tokens = [item for item in tokens if not item[1].get('ignore', False)]
    if debug:
        import pprint
        pprint.PrettyPrinter(indent=4).pprint(tokens)
    return {name: token['content'] for name, token in tokens}
def parse(s, html, idx):
    """Extract the login form fields (and captcha text) from a login page.

    Args:
        s: The ``requests`` session used to download the captcha image.
        html: HTML source of the login page.
        idx: Caller-supplied index, used as a prefix of the captcha file name.

    Returns:
        A dict with ``lt``, ``execution`` and ``path`` (the form action),
        plus ``validateCode`` when the page shows a captcha image; None when
        any of the three form fields is missing.
    """
    result = {}
    tree = etree.HTML(html)
    try:
        result['lt'] = tree.xpath('//input[@name="lt"]/@value')[0]
        result['execution'] = tree.xpath(
            '//input[@name="execution"]/@value')[0]
        result['path'] = tree.xpath('//form[@id="fm1"]/@action')[0]
    except IndexError:
        return None

    valimgs = tree.xpath('//img[@id="yanzheng"]/@src')
    valimg = valimgs[0] if valimgs else None
    if valimg:
        # A fresh file per captcha: caller index plus a running counter.
        fname = 'img/' + str(idx) + '_' + str(config['gid']) + '.jpg'
        config['gid'] = config['gid'] + 1
        ri = s.get("https://passport.csdn.net" + valimg)
        with open(fname, 'wb') as f:
            for chk in ri:
                f.write(chk)
        validateCode = pytesser.image_file_to_string(fname)
        # OCR output may contain stray spaces and newlines.
        validateCode = validateCode.strip()
        validateCode = validateCode.replace(' ', '')
        validateCode = validateCode.replace('\n', '')
        result['validateCode'] = validateCode
    return result


def login(usr, pwd, idx):
    """Log in to passport.csdn.net, retrying while the captcha is rejected.

    Args:
        usr: Account name.
        pwd: Account password.
        idx: Passed through to ``parse`` for naming captcha files.

    Returns:
        True when the response shows no error message (or an unrecognised
        one); False when the login form cannot be parsed, the credentials
        are rejected, or the account is temporarily locked.
    """
    s = requests.Session()
    r = s.get('https://passport.csdn.net/account/login',
              headers={
                  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0',
                  'Host': 'passport.csdn.net',
              })
    # Session-wide headers for the form posts; the values never change, so
    # set them once.  Header values must be strings for requests.
    s.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'passport.csdn.net',
        'Origin': 'https://passport.csdn.net',
        'Referer': 'https://passport.csdn.net/account/login',
        'Upgrade-Insecure-Requests': '1',
    })

    # Error messages the site is expected to return.
    validate_code_err = '驗證碼錯誤'
    usr_pass_err = '帳戶名或登入密碼不正確,請重新輸入'
    try_later_err = '登入失敗連續超過5次,請10分鐘後再試'

    while True:
        res = parse(s, r.text, idx)
        if res is None:
            return False
        url = 'https://passport.csdn.net' + res['path']
        form = {
            'username': usr,
            'password': pwd,
            '_eventId': 'submit',
            'execution': res['execution'],
            'lt': res['lt'],
        }
        if 'validateCode' in res:
            form['validateCode'] = res['validateCode']

        r = s.post(url, data=form)
        tree = etree.HTML(r.text)
        err_strs = tree.xpath('//span[@id="error-message"]/text()')
        if len(err_strs) == 0:
            return True
        err_str = err_strs[0]
        print(err_str)

        # Messages are told apart by the first five bytes of their UTF-8
        # form (the literals above are byte strings on Python 2).
        err = err_str.encode('utf8')
        if err[:5] == validate_code_err[:5]:
            # Wrong captcha: loop again with the page that was just returned.
            pass
        elif err[:5] == usr_pass_err[:5]:
            return False
        elif err[:5] == try_later_err[:5]:
            return False
        else:
            return True


if __name__ == '__main__':
    # NOTE(review): ``main`` is not defined in the visible code - confirm it
    # exists elsewhere in the file (``login`` takes the same three arguments).
    main(sys.argv[1], sys.argv[2], 0)
def OCRreadline(img, *args):
    """Read one text line from ``img`` column by column using OCR.

    The line is cut into consecutive columns, left to right.  Each column is
    colour-filtered, written to ``tmp.png``, shown on screen for half a
    second and passed to pytesser.  A second, narrower colour filter around
    ``COLOR_ALLIES`` decides the line's hit class.

    Args:
        img: Image containing a single line of text (must expose ``shape``).
        *args: Width of each column, in order from the left edge.

    Returns:
        A list with the OCR text of every column followed by one hit-class
        entry: ``HIT_CLASS1`` or ``HIT_CLASS2`` when the COLOR_ALLIES filter
        found pixels in one of the first three columns, otherwise "N/A".
        The list is also printed.
    """
    left_edge = 0  # moves right by one column width per iteration
    top_edge = 0  # static
    hit_class = ""
    line_str = []

    # Bounds of the single-colour filter: COLOR_ALLIES +/- the configured
    # ranges, clamped to 0..179 for hue and 0..255 for saturation/value.
    # They do not depend on the column, so compute them once.
    lb_h = max(COLOR_ALLIES[0] - RANGE_HUE, 0)
    lb_s = max(COLOR_ALLIES[1] - RANGE_SAT, 0)
    lb_v = max(COLOR_ALLIES[2] - RANGE_VAL, 0)
    ub_h = min(COLOR_ALLIES[0] + RANGE_HUE, 179)
    ub_s = min(COLOR_ALLIES[1] + RANGE_SAT, 255)
    ub_v = min(COLOR_ALLIES[2] + RANGE_VAL, 255)

    for i, width in enumerate(args, start=1):
        # NOTE(review): the last argument is img.shape[1]; confirm that is
        # the value ``crop`` expects for the height.
        sub_cropped_img = crop(img, left_edge, top_edge, width, img.shape[1])

        # Keep the pixels of any relevant colour for OCR ...
        masked_img = filter_color(sub_cropped_img,
                                  LBOUND_H, LBOUND_S, LBOUND_V,
                                  UBOUND_H, UBOUND_S, UBOUND_V)
        # ... and separately only those of the COLOR_ALLIES group.
        masked_img_single = filter_color(sub_cropped_img,
                                         lb_h, lb_s, lb_v,
                                         ub_h, ub_s, ub_v)

        # Any pixel that is not 255 means the colour group is present.
        # Columns 1-2 select the first class, column 3 the second.
        if i <= 3 and np.any(masked_img_single != 255):
            hit_class = HIT_CLASS1 if i < 3 else HIT_CLASS2

        cv2.imwrite("tmp.png", masked_img)
        cv2.imshow("current mask", masked_img)
        cv2.waitKey(500)
        # Drop the newline OCR appends to the recognised text.
        line_str.append(
            pytesser.image_file_to_string("tmp.png").rstrip("\n\r"))
        # TODO: if the OCR result is empty, retry with different settings.

        left_edge += width

    # NOTE(review): assumed to run once per line, after the last column -
    # confirm against the original layout.
    line_str.append(hit_class if hit_class != "" else "N/A")

    print("{}".format(line_str))
    return line_str
from PIL import Image
import pytesser

# Compare pytesser's two entry points on the same picture: the file-based
# call first, then the call that takes an already opened PIL image.
image = Image.open('test.jpg')
from_file = pytesser.image_file_to_string('test.jpg')
print(from_file)
from_image = pytesser.image_to_string(image)
print(from_image)