def img2str(self, captcha_body):
    """Recognise the text of a captcha image supplied as raw bytes.

    The image is converted to grayscale, its one-pixel outer border is
    painted white, the result is binarised with a fixed threshold and is
    then passed to ``get_text_from_image`` restricted to
    ``self.captcha_char_whitelist``.

    Args:
        captcha_body: Encoded image data; it is wrapped in ``BytesIO`` and
            opened with ``Image.open``.

    Returns:
        The recognised text with every space character removed.
    """
    # Gray levels below the threshold map to 0, all others to 1.
    # 140 was found to be a suitable cut-off for this captcha.
    threshold = 140
    table = [0 if level < threshold else 1 for level in range(256)]

    with BytesIO(captcha_body) as captcha_filelike, Image.open(
            captcha_filelike) as img:
        gray_img = img.convert('L')  # 8-bit grayscale
        try:
            width, height = gray_img.size
            pix = gray_img.load()  # pixel-access object
            # Whiten the top and bottom border rows.
            for x in range(width):
                pix[x, 0] = pix[x, height - 1] = 255
            # Whiten the left and right border columns.
            for y in range(height):
                pix[0, y] = pix[width - 1, y] = 255
            new_img = gray_img.point(table, '1')
        finally:
            # The grayscale intermediate is no longer needed once the
            # binarised copy exists (or an error occurred).
            gray_img.close()

        try:
            # psm=7: presumably Tesseract's "single text line" page
            # segmentation mode -- confirm against get_text_from_image.
            return get_text_from_image(
                new_img,
                psm=7,
                tessedit_char_whitelist=self.captcha_char_whitelist,
            ).replace(' ', '')
        finally:
            # Release the binarised image even when recognition fails.
            new_img.close()
def parse_capatcha(captcha_body):
    """Recognise the text of a captcha image supplied as raw bytes.

    Pipeline: decode the image, denoise it with
    ``cv2.fastNlMeansDenoisingColored``, convert it to grayscale, binarise
    it with a fixed threshold of 125 and run ``get_text_from_image`` on the
    result, restricted to ``captcha_char_whitelist`` (a name resolved from
    the enclosing scope).

    Args:
        captcha_body: Encoded image data; it is wrapped in ``BytesIO`` and
            opened with ``Image.open``.

    Returns:
        The recognised text with every space character removed.
    """
    with BytesIO(captcha_body) as captcha_filelike, Image.open(captcha_filelike) as img:
        # Non-local-means denoising on the colour image.
        denoised = cv2.fastNlMeansDenoisingColored(numpy.array(img))

        # Grayscale conversion.
        # NOTE(review): numpy.array() on a PIL image normally yields RGB
        # channel order while COLOR_BGR2GRAY assumes BGR; kept unchanged
        # because the threshold below was presumably tuned with it -- confirm.
        gray = cv2.cvtColor(denoised, cv2.COLOR_BGR2GRAY)

        # Binarisation: pixels above 125 become 255, the rest 0.
        _, binary = cv2.threshold(gray, 125, 255, cv2.THRESH_BINARY)

        dest_img = Image.fromarray(binary)
        try:
            return get_text_from_image(
                dest_img,
                tessedit_char_whitelist=captcha_char_whitelist,
            ).replace(' ', '')
        finally:
            # Release the image even when recognition fails.
            dest_img.close()
def img2str(captcha_body):
    """Recognise the text of a captcha image.

    The image is converted to grayscale, binarised with a fixed threshold
    and passed to ``get_text_from_image`` with the character whitelist set
    to ``digits``.

    Args:
        captcha_body: Passed unchanged to ``Image.open`` (so anything that
            function accepts).

    Returns:
        The recognised text with every space character removed.
    """
    captcha_char_whitelist = digits

    # Gray levels below the threshold map to 0, all others to 1.
    threshold = 180
    table = [0 if level < threshold else 1 for level in range(256)]

    with Image.open(captcha_body) as img:
        gray_img = img.convert('L')  # 8-bit grayscale
        try:
            new_img = gray_img.point(table, '1')
        finally:
            # The grayscale intermediate is no longer needed once the
            # binarised copy exists (or an error occurred).
            gray_img.close()

        try:
            # psm=7: presumably Tesseract's "single text line" page
            # segmentation mode -- confirm against get_text_from_image.
            return get_text_from_image(
                new_img,
                psm=7,
                tessedit_char_whitelist=captcha_char_whitelist,
            ).replace(' ', '')
        finally:
            # Release the binarised image even when recognition fails.
            new_img.close()
def test_options(self):
    """Keyword options are forwarded: ``psm=10`` yields exactly one character."""
    image = get_test_image('quickfox.png')

    # psm=10 selects single-character recognition.
    recognized = get_text_from_image(image, psm=10)

    assert isinstance(recognized, six.text_type)
    assert len(recognized) == 1
def test_configs(self):
    """Check the default config, the ``digits`` config file and a manual whitelist.

    The same alphanumeric test image is recognised three times; spaces in
    the output are ignored when checking which characters were produced.
    """
    image = get_test_image('alphanumeric.png')
    digit_chars = set("0123456789-")

    # Default configuration: letters are allowed, so at least one
    # non-space character must fall outside the digit set.
    default_text = get_text_from_image(image)
    assert isinstance(default_text, six.text_type)
    assert any(ch not in digit_chars for ch in default_text if ch != ' ')

    # Named "digits" config file: only digit-set characters may appear.
    digits_text = get_text_from_image(image, config_name='digits')
    assert isinstance(digits_text, six.text_type)
    assert all(ch in digit_chars for ch in digits_text if ch != ' ')

    # Manual whitelist passed as a keyword option.
    manual_whitelist = "123"
    manual_chars = set(manual_whitelist)
    manual_text = get_text_from_image(
        image,
        tessedit_char_whitelist=manual_whitelist,
    )
    assert isinstance(manual_text, six.text_type)
    assert all(ch in manual_chars for ch in manual_text if ch != ' ')
def recognize_captcha_by_tesseract(img, digits_only=False, letters_only=False, del_noise=False):
    """Recognise captcha text in ``img`` via ``get_text_from_image``.

    Args:
        img: Image handed to ``convert_img_2_baw`` before recognition.
        digits_only: When true, the whitelist is ``digits`` (takes
            precedence over ``letters_only``).
        letters_only: When true (and ``digits_only`` is false), the
            whitelist is ``ascii_letters``.
        del_noise: When true, the converted image is additionally passed
            through ``del_img_noise``.

    Returns:
        The recognised text with every match of ``blank_pattern`` removed.
    """
    # Pick the character whitelist; fall back to ``all_chars``.
    whitelist = (
        digits if digits_only
        else ascii_letters if letters_only
        else all_chars
    )

    # Presumably a black-and-white conversion -- see convert_img_2_baw.
    prepared = convert_img_2_baw(img)
    if del_noise:
        prepared = del_img_noise(prepared)

    raw_text = get_text_from_image(prepared, tessedit_char_whitelist=whitelist)
    return blank_pattern.sub("", raw_text)
def test_simple_sentence(self):
    """A plain sentence image is recognised as text similar to the expected sentence."""
    expected = "The quick brown fox jumps over the lazy dog"

    recognized = get_text_from_image(get_test_image('quickfox.bmp'))

    assert isinstance(recognized, six.text_type)
    # Similarity (not equality) is checked via check_similarity_ratio.
    assert check_similarity_ratio(recognized, expected)
def test_blank_image(self):
    """An all-white 100x100 RGB image produces an empty text result."""
    white = (255, 255, 255)
    canvas = Image.new("RGB", (100, 100), color=white)

    recognized = get_text_from_image(canvas)

    assert isinstance(recognized, six.text_type)
    assert not recognized
def test_blank_image(self):
    """Recognising a blank (solid white) image returns an empty text string."""
    size = (100, 100)
    solid_white = Image.new("RGB", size, color=(255, 255, 255))

    result = get_text_from_image(solid_white)

    assert isinstance(result, six.text_type)
    assert len(result) == 0
def extract_text_by_google(imagePath):
    """Open the image at ``imagePath`` and return the text recognised in it.

    Args:
        imagePath: Passed unchanged to ``Image.open``.

    Returns:
        Whatever ``get_text_from_image`` returns for the opened image.
    """
    with Image.open(imagePath) as image:
        recognized = get_text_from_image(image)
    return recognized