Exemplo n.º 1
0
 def extract_text(self, is_captcha=False):
     """Run OCR over ``self.image`` and return the recognised text.

     ``self.image`` may be an image object or a ``str`` path; a path is
     opened first.  The image is converted to mode ``'L'`` before it is
     passed to ``image_to_string``.

     :param is_captcha: when true, strip the OCR output and drop every
         non-word character; if nothing is left, the raw OCR output is
         returned instead.
     :return: the cleaned text for captchas, otherwise the raw OCR output.
     """
     if isinstance(self.image, str):
         source = Image.open(self.image)
     else:
         source = self.image
     image = source.convert('L')

     # OCR is the expensive step, so run it once and reuse the result for
     # both the cleaned and the raw outcome.
     text = image_to_string(image)
     if is_captcha:
         cleaned = sub(r'[\W]', '', (text or '').strip())
         if cleaned:
             return cleaned
     return text
Exemplo n.º 2
0
    def get_text_from_file(self, filepath):
        """Return the text that pyocr's tesseract wrapper reads from a file.

        :param filepath: path of the image file to open with PIL.
        :return: the recognised text with leading and trailing carriage
            returns and newlines removed.
        """
        from pyocr import tesseract, builders
        from PIL import Image

        # The context manager releases the file handle once OCR is done.
        with Image.open(filepath) as image:
            text = tesseract.image_to_string(image=image,
                                             builder=builders.TextBuilder())
        return text.strip('\r\n')
Exemplo n.º 3
0
    def __test_txt(self, image_file, expected_box_file, lang='eng'):
        """Compare the OCR boxes of a test image with a stored reference.

        Both lists are sorted, must be non-empty and equally long, and each
        box content must be a text string (``unicode`` on Python 2.7,
        ``str`` on Python 3) before the boxes themselves are compared.
        """
        image_path = "tests/data/" + image_file
        reference_path = "tests/tesseract/" + expected_box_file

        with codecs.open(reference_path, 'r', encoding='utf-8') as reference:
            expected_boxes = self.builder.read_file(reference)
        expected_boxes.sort()

        boxes = tesseract.image_to_string(
            Image.open(image_path), lang=lang, builder=self.builder)
        boxes.sort()

        self.assertTrue(len(boxes) > 0)
        self.assertEqual(len(boxes), len(expected_boxes))

        try:
            text_type = unicode  # python 2.7
        except NameError:
            text_type = str  # python 3

        for actual, expected in zip(boxes, expected_boxes):
            self.assertEqual(type(expected.content), text_type)
            self.assertEqual(type(actual.content), text_type)
            self.assertEqual(actual, expected)
Exemplo n.º 4
0
def test_text(image_file, lang='eng'):
    """Print *image_file* and return what tesseract reads from it.

    :param image_file: path of the image to open with PIL.
    :param lang: language passed on to tesseract.
    :return: the result of ``tesseract.image_to_string`` using a
        ``tesseract.DigitBuilder``.
    """
    # The call form works on Python 2 and 3; the statement form is a
    # SyntaxError on Python 3.
    print(image_file)

    return tesseract.image_to_string(
        Image.open(image_file),
        lang=lang,
        builder=tesseract.DigitBuilder())
Exemplo n.º 5
0
    def _read_from_img(self, image_path, lang=None):
        """OCR the image at *image_path* and return its boxes, sorted."""
        picture = Image.open(image_path)
        sorted_boxes = tesseract.image_to_string(
            picture, lang=lang, builder=self._builder)
        sorted_boxes.sort()
        return sorted_boxes
    def get_digit(self, image, min_score):
        """Read a numeric score from a region of *image* via OCR.

        Returns 0 when ``self.recognize_digit`` is falsy.  Otherwise the
        region selected by ``selectROI`` is written to ``digit_sample.jpg``,
        passed through tesseract, and stripped of ``,`` and ``.``.  If the
        remaining text is all digits and not below *min_score*, it is
        returned divided by 100; in every other case *min_score* is returned.
        """
        if not self.recognize_digit:
            return 0

        import pyocr
        import pyocr.tesseract as tess
        import pyocr.builders
        from PIL import Image

        digit_ratio = (0.1016, 0.1836, 0.2228, 0.2268)
        bw_img = selectROI(image, ratio=digit_ratio, round8=False)
        cv2.imwrite('digit_sample.jpg', bw_img)
        raw_text = tess.image_to_string(Image.fromarray(bw_img),
                                        lang='eng',
                                        builder=pyocr.builders.TextBuilder())
        digits_only = raw_text.replace(',', '').replace('.', '')

        # Reward should not decrease.
        # NOTE(review): the comparison uses the unscaled integer while the
        # returned value is divided by 100 - confirm this is intended.
        if digits_only.isdigit() and int(digits_only) >= min_score:
            return int(digits_only) / 100.0
        return min_score
Exemplo n.º 7
0
    def __test_txt(self, image_file, expected_box_file, lang='eng'):
        """Check OCR boxes for a test image against the expected box file.

        The sorted OCR result must be non-empty, as long as the sorted
        reference list, hold text-typed contents (``unicode`` on Python 2.7,
        ``str`` on Python 3) and match the reference box by box.
        """
        image_path = "tests/data/" + image_file
        expected_path = "tests/tesseract/" + expected_box_file

        with codecs.open(expected_path, 'r', encoding='utf-8') as handle:
            expected_boxes = self.builder.read_file(handle)
        expected_boxes.sort()

        source_image = Image.open(image_path)
        boxes = tesseract.image_to_string(source_image,
                                          lang=lang,
                                          builder=self.builder)
        boxes.sort()

        self.assertTrue(len(boxes) > 0)
        self.assertEqual(len(boxes), len(expected_boxes))

        try:
            content_type = unicode  # python 2.7
        except NameError:
            content_type = str  # python 3

        for got, wanted in zip(boxes, expected_boxes):
            self.assertEqual(type(wanted.content), content_type)
            self.assertEqual(type(got.content), content_type)
            self.assertEqual(got, wanted)
Exemplo n.º 8
0
 def test_text_error_file(self, run_tesseract, copen, temp_dir):
     """An unexpected error raised by ``copen`` must propagate."""
     run_tesseract.return_value = (0, "")
     copen.side_effect = Exception("Unknown error")
     with TemporaryDirectory(prefix="tess_") as tmpdir:
         # temp_dir's return value acts as a context manager yielding tmpdir.
         fake_context = MagicMock()
         fake_context.__enter__.return_value = tmpdir
         temp_dir.return_value = fake_context
         with self.assertRaises(Exception):
             tesseract.image_to_string(self.image, builder=self.builder)

     expected_kwargs = {
         "cwd": tmpdir,
         "lang": None,
         "flags": self.builder.tesseract_flags,
         "configs": self.builder.tesseract_configs,
     }
     run_tesseract.assert_called_once_with(
         "input.bmp", "output", **expected_kwargs)
Exemplo n.º 9
0
 def test_text_cannot_open_file(self, run_tesseract, copen, temp_dir):
     """A ``PermissionError`` raised by ``copen`` must propagate."""
     run_tesseract.return_value = (0, "")
     copen.side_effect = PermissionError(errno.EPERM, "Error opening file")
     with TemporaryDirectory(prefix="tess_") as tmpdir:
         # temp_dir's return value acts as a context manager yielding tmpdir.
         fake_context = MagicMock()
         fake_context.__enter__.return_value = tmpdir
         temp_dir.return_value = fake_context
         with self.assertRaises(PermissionError):
             tesseract.image_to_string(self.image, builder=self.builder)

     expected_kwargs = {
         "cwd": tmpdir,
         "lang": None,
         "flags": self.builder.tesseract_flags,
         "configs": self.builder.tesseract_configs,
     }
     run_tesseract.assert_called_once_with(
         "input.bmp", "output", **expected_kwargs)
Exemplo n.º 10
0
    def main(self, text_img_name):
        """OCR the image at *text_img_name* and return the recognised text.

        ``self.lang`` is used as the OCR language and a
        ``pyocr.builders.TextBuilder`` as the builder.
        """
        picture = Im.open(text_img_name)
        return tool.image_to_string(
            picture,
            lang=self.lang,
            builder=pyocr.builders.TextBuilder(),
        )
Exemplo n.º 11
0
    def _read_from_img(self, image_path, lang=None):
        """Return the sorted OCR boxes found in the image at *image_path*."""
        opened = Image.open(image_path)
        result = tesseract.image_to_string(opened, lang=lang,
                                           builder=self._builder)
        result.sort()
        return result
Exemplo n.º 12
0
def find_secret_rects(image, secret_res, lang, tesseract_configs=None):
    """
    Find the rectangles in an image whose OCR'd text matches a secret.

    The image is OCR'd one box per character; each pattern in
    ``secret_res`` is searched in the concatenated box contents, and the
    positions of the matched boxes are merged per line, padded by 2 and
    collected.

    param: Image image
    param: list secret_res (``finditer`` is called on each entry)
    param: str lang
    param: str tesseract_configs
    return: list of rects
    rtype: list
    """

    # When using pyocr, an input image is converted to RGB (not RGBA).
    # During the conversion, transparent pixels are converted into BLACK.
    # Sometimes the black pixels get in the way of recognizing text.
    #
    # For example, a macOS screenshot taken with Command+Shift+4+Space has
    # transparent pixels around the window. This results in a thick black
    # border at the edge of the image, which worsens the OCR quality of
    # text near the border. To avoid this, turn transparent pixels WHITE
    # by pasting the image onto a white background.
    #
    # See: http://stackoverflow.com/questions/9166400/convert-rgba-png-to-rgb-with-pil
    if image.mode == 'RGBA':
        white_canvas = Image.new('RGB', image.size, (255, 255, 255))
        alpha_channel = image.split()[3]
        white_canvas.paste(image, mask=alpha_channel)
        image = white_canvas

    # The whole image is scanned, so rects need no shifting afterwards.
    offset = (0, 0)
    cropped_image = image

    builder = ModifiedCharBoxBuilder(cropped_image.size[1])
    if tesseract_configs:
        builder.tesseract_configs = tesseract_configs

    boxes = image_to_string(cropped_image, lang=lang, builder=builder)

    if os.environ.get('DEBUG'):
        for box in boxes:
            print(box.content, box.position)

    content = ''.join(box.content for box in boxes)
    # Match offsets are used as box indexes below, which requires exactly
    # one character per box.
    assert len(boxes) == len(content)

    secret_rects = []
    for pattern in secret_res:
        for match in pattern.finditer(content):
            positions = [
                box.position for box in boxes[match.start():match.end()]
            ]
            secret_rects.extend(
                offset_rect(offset, padding_box(line_rect, 2))
                for line_rect in bounding_boxes_by_line(positions)
            )

    return secret_rects
Exemplo n.º 13
0
 def test_char_error(self, run_tesseract, copen, temp_dir):
     """A non-zero tesseract status must surface as ``TesseractError``."""
     run_tesseract.return_value = (1, "Error")
     copen.return_value = StringIO(self._get_file_content("boxes"))
     with TemporaryDirectory(prefix="tess_") as tmpdir:
         # temp_dir's return value acts as a context manager yielding tmpdir.
         fake_context = MagicMock()
         fake_context.__enter__.return_value = tmpdir
         temp_dir.return_value = fake_context
         with self.assertRaises(tesseract.TesseractError) as te:
             tesseract.image_to_string(self.image, builder=self.builder)

     raised = te.exception
     self.assertEqual(raised.status, 1)
     self.assertEqual(raised.message, "Error")

     expected_kwargs = {
         "cwd": tmpdir,
         "lang": None,
         "flags": self.builder.tesseract_flags,
         "configs": self.builder.tesseract_configs,
     }
     run_tesseract.assert_called_once_with(
         "input.bmp", "output", **expected_kwargs)
Exemplo n.º 14
0
 def test_char_no_output(self, run_tesseract, copen, temp_dir):
     """A missing output file must surface as ``TesseractError`` (-1)."""
     run_tesseract.return_value = (0, "No file output")
     copen.side_effect = FileNotFoundError(
         errno.ENOENT, "[Errno 2] No such file or directory: 'output'")
     with TemporaryDirectory(prefix="tess_") as tmpdir:
         # temp_dir's return value acts as a context manager yielding tmpdir.
         fake_context = MagicMock()
         fake_context.__enter__.return_value = tmpdir
         temp_dir.return_value = fake_context
         with self.assertRaises(tesseract.TesseractError) as te:
             tesseract.image_to_string(self.image, builder=self.builder)

     raised = te.exception
     self.assertEqual(raised.status, -1)
     self.assertIn("Unable to find output file (tested", raised.message)

     expected_kwargs = {
         "cwd": tmpdir,
         "lang": None,
         "flags": self.builder.tesseract_flags,
         "configs": self.builder.tesseract_configs,
     }
     run_tesseract.assert_called_once_with(
         "input.bmp", "output", **expected_kwargs)
Exemplo n.º 15
0
    def __test_txt(self, image_file, expected_output_file, lang="eng"):
        """Check that the OCR text of a test image equals the stored text.

        The reference file is read as utf-8 and stripped before comparing.
        """
        image_path = "tests/data/" + image_file
        reference_path = "tests/tesseract/" + expected_output_file

        with codecs.open(reference_path, "r", encoding="utf-8") as reference:
            expected_output = "".join(reference).strip()

        output = tesseract.image_to_string(Image.open(image_path), lang=lang)

        self.assertEqual(output, expected_output)
Exemplo n.º 16
0
    def main(self, path):
        """Rename every image in *path* after the text OCR finds in it.

        Files whose lower-cased extension is not in ``VALIDITY`` are only
        counted.  For each image the OCR text is flattened to one line, cut
        to 60 characters and used as the new base name, keeping the original
        extension.  Progress and a final summary are printed.

        :param path: directory to scan (not recursive).
        """
        count = 0
        other_files = 0

        for f in os.listdir(path):
            ext = os.path.splitext(f)[1]  # e.g. ".png" / ".jpg"

            if ext.lower() not in VALIDITY:
                other_files += 1
                continue

            count += 1
            image_file_name = os.path.join(path, f)

            # Close the image before renaming: an open handle blocks the
            # rename on some platforms.
            with Im.open(image_file_name) as img:
                txt = tool.image_to_string(
                    img,
                    lang=self.lang,
                    builder=pyocr.builders.TextBuilder())

            # Flatten control characters to spaces; '\r' is dropped.
            initial = (txt.replace('\a', ' ')
                          .replace('\b', ' ')
                          .replace('\f', ' ')
                          .replace('\n', ' ')
                          .replace('\r', '')
                          .replace('\t', ' ')
                          .replace('\v', ' '))
            # A path separator in the OCR text would make the rename target
            # a different directory, so neutralise it.
            for separator in (os.sep, os.altsep):
                if separator:
                    initial = initial.replace(separator, ' ')
            initial = initial[:60]  # keep the first 60 characters
            print('Filename:' + initial + '\n')

            # NOTE(review): this makes the directory world-writable on every
            # iteration - confirm it is really required.
            os.chmod(path, 0o777)
            os.rename(image_file_name, os.path.join(path, initial + ext))

            print(
                str(count) + (" file" if count == 1 else " files") +
                " processed")

        if count + other_files == 0:
            print("No files found")
        else:
            print(
                str(count) + " / " + str(count + other_files) +
                " files converted")
Exemplo n.º 17
0
    def __test_txt(self, image_file, expected_output_file, lang='eng'):
        """Compare the OCR text of a test image with its reference file.

        The reference is read as utf-8 and stripped before the comparison.
        """
        image_path = "tests/data/" + image_file
        expected_path = "tests/tesseract/" + expected_output_file

        with codecs.open(expected_path, 'r', encoding='utf-8') as handle:
            reference_lines = list(handle)
        expected_output = "".join(reference_lines).strip()

        picture = Image.open(image_path)
        output = tesseract.image_to_string(picture, lang=lang)

        self.assertEqual(output, expected_output)
Exemplo n.º 18
0
    def __test_text(self, image_file, expected_output_file, lang='eng'):
        """Check OCR text produced with ``self.builder`` against a reference.

        The reference file is read as utf-8 and stripped before comparing.
        """
        image_path = "tests/data/" + image_file
        reference_path = "tests/tesseract/" + expected_output_file

        with codecs.open(reference_path, 'r', encoding='utf-8') as reference:
            expected_output = "".join(reference).strip()

        output = tesseract.image_to_string(
            Image.open(image_path), lang=lang, builder=self.builder)

        self.assertEqual(output, expected_output)
Exemplo n.º 19
0
    def __test_txt(self, image_file, expected_box_file, lang="eng"):
        """Compare the sorted OCR boxes of a test image with a reference."""
        image_path = "tests/data/" + image_file
        reference_path = "tests/tesseract/" + expected_box_file

        with codecs.open(reference_path, "r", encoding="utf-8") as reference:
            expected_boxes = self.builder.read_file(reference)
        expected_boxes.sort()

        boxes = tesseract.image_to_string(
            Image.open(image_path), lang=lang, builder=self.builder)
        boxes.sort()

        self.assertEqual(len(boxes), len(expected_boxes))

        for actual, expected in zip(boxes, expected_boxes):
            self.assertEqual(actual, expected)
Exemplo n.º 20
0
    def __test_txt(self, image_file, expected_output_file, lang='eng'):
        """Check the OCR text of a "specific" test image against its reference.

        The reference file is read as utf-8 and stripped before comparing.
        """
        image_path = os.path.join("tests", "input", "specific", image_file)
        reference_path = os.path.join(
            "tests", "output", "specific", "tesseract", expected_output_file)

        with codecs.open(reference_path, 'r', encoding='utf-8') as reference:
            expected_output = "".join(reference).strip()

        output = tesseract.image_to_string(Image.open(image_path), lang=lang)

        self.assertEqual(output, expected_output)
Exemplo n.º 21
0
def rec_img(img):
    """Crop the index area out of *img*, enlarge it and return its digits.

    The lower part of the image, starting at a computed left edge, is
    cropped, scaled by 2.4 and passed to tesseract; every non-digit
    character is removed from the OCR result.

    :param img: a PIL image.
    :return: the recognised digits as a string (possibly empty).
    """
    width = img.size[0]
    height = img.size[1]
    # Build the position of the index: (left, upper, right, lower).
    # NOTE(review): `length` is not defined in this function; presumably a
    # module-level value - confirm.
    rangle = (24.5+6.1*length+5, int(height/2), int(width), int(height))
    # Cut the region out of the screenshot.
    img = img.crop(rangle)
    # Enlarge the picture.
    (x, y) = img.size
    x_s = int(x*2.4)
    y_s = int(y*2.4)
    # LANCZOS is the filter ANTIALIAS used to alias; the ANTIALIAS name no
    # longer exists in current Pillow releases.
    imgzoom = img.resize((x_s, y_s), Image.LANCZOS)
    code = tesseract.image_to_string(imgzoom)
    return re.sub(r"\D", "", code)
Exemplo n.º 22
0
    def __test_txt(self, image_file, expected_output_file, lang='eng'):
        """Compare OCR text for a "specific" input image with its reference.

        The reference is read as utf-8 and stripped before the comparison.
        """
        image_path = os.path.join("tests", "input", "specific", image_file)
        expected_path = os.path.join(
            "tests", "output", "specific", "tesseract", expected_output_file
        )

        with codecs.open(expected_path, 'r', encoding='utf-8') as handle:
            reference_lines = list(handle)
        expected_output = "".join(reference_lines).strip()

        picture = Image.open(image_path)
        output = tesseract.image_to_string(picture, lang=lang)

        self.assertEqual(output, expected_output)
Exemplo n.º 23
0
    def __test_txt(self, image_file, expected_box_file, lang='eng'):
        """Check the sorted OCR boxes of a test image against a box file."""
        image_path = "tests/data/" + image_file
        expected_path = "tests/tesseract/" + expected_box_file

        with codecs.open(expected_path, 'r', encoding='utf-8') as handle:
            expected_boxes = self.builder.read_file(handle)
        expected_boxes.sort()

        picture = Image.open(image_path)
        boxes = tesseract.image_to_string(picture, lang=lang,
                                          builder=self.builder)
        boxes.sort()

        self.assertEqual(len(boxes), len(expected_boxes))

        for got, wanted in zip(boxes, expected_boxes):
            self.assertEqual(got, wanted)
Exemplo n.º 24
0
        def login(username,password):
            """Solve the login captcha via OCR, then post the login form.

            Each attempt downloads the captcha, binarises it, OCRs it and
            asks the site to validate the code.  A rejected code starts a
            new attempt immediately; any error is reported and retried
            after a one second pause.  The loop ends after the login form
            has been posted.

            :param username: value sent as ``username``.
            :param password: value sent as ``userpass``.
            """
            name = "code.jpg"
            while True:
                try:
                    image = requests.get('http://www.ourui.com/userself/RndCode.asp?rndtype=LOGIN_RndCode')
                    with open(name, "wb") as f:
                        f.write(image.content)
                    imagefile = Image.open(name)

                    # Convert to luminance.
                    imgry = imagefile.convert('L')
                    imgry.save('g'+name)
                    # Binarise.
                    out = imgry.point(lambda x: 255 if x > 141 else 0)
                    out.save('b'+name)

                    # Recognise.
                    val = tesseract.image_to_string(out)
                    print(val)
                    imgreq = requests.get("http://www.ourui.com/UserSelf/rndcode_check.asp?name=lg&rndcode="+val)
                    print(imgreq.content)
                    # `content` is bytes; comparing it with str literals is
                    # always false on Python 3, so compare against bytes.
                    if imgreq.content in (b"0", b""):
                        continue
                    requests.post("http://www.ourui.com/userself/login.asp", data={"username":username,"userpass":password})
                    print("login ok register",self.domain)

                    break
                except Exception:
                    # Exception (not a bare except) keeps Ctrl-C able to
                    # stop the retry loop.
                    time.sleep(1)
                    print("矮油网速太渣,登录失败,重新登录!")
Exemplo n.º 25
0
 def get_data(self, region, rerun=0,
              image_optimizer=None, data_optimizer=None, threshold=None):
     '''Gets data out of a single region
     :param region: image region handed to tesseract (after optimizing).
     :param rerun: incrementing integer for every rerun.
     :param image_optimizer: callback, takes PIL.Image and rerun as params.
     :param data_optimizer: callback, takes String and rerun as params.
     :param threshold: callback, takes String and should return False for a rerun.
     :return: the (optionally optimized) OCR result that passed threshold.
     '''
     # Without an optimizer the region is used as-is; leaving the name
     # unset would raise UnboundLocalError on the next line.
     if image_optimizer:
         optiregion = image_optimizer(region, rerun)
     else:
         optiregion = region
     data = tesseract.image_to_string(optiregion)
     if data_optimizer:
         data = data_optimizer(data, rerun)
     if threshold and not threshold(data):
         return self.get_data(
             region, rerun + 1,
             image_optimizer, data_optimizer, threshold)
     return data
Exemplo n.º 26
0
    def __test_txt(self, image_file, expected_box_file, lang='eng'):
        """Compare sorted OCR boxes of a "specific" image with a reference."""
        image_path = os.path.join("tests", "input", "specific", image_file)
        reference_path = os.path.join(
            "tests", "output", "specific", "tesseract", expected_box_file)

        with codecs.open(reference_path, 'r', encoding='utf-8') as reference:
            expected_boxes = self.builder.read_file(reference)
        expected_boxes.sort()

        boxes = tesseract.image_to_string(
            Image.open(image_path), lang=lang, builder=self.builder)
        boxes.sort()

        self.assertEqual(len(boxes), len(expected_boxes))

        for actual, expected in zip(boxes, expected_boxes):
            self.assertEqual(actual, expected)
Exemplo n.º 27
0
    def __test_txt(self, image_file, expected_box_file, lang='eng'):
        """Check sorted OCR boxes of a "specific" image against a box file."""
        image_path = os.path.join("tests", "input", "specific", image_file)
        expected_path = os.path.join(
            "tests", "output", "specific", "tesseract", expected_box_file
        )

        with codecs.open(expected_path, 'r', encoding='utf-8') as handle:
            expected_boxes = self.builder.read_file(handle)
        expected_boxes.sort()

        picture = Image.open(image_path)
        boxes = tesseract.image_to_string(picture, lang=lang,
                                          builder=self.builder)
        boxes.sort()

        self.assertEqual(len(boxes), len(expected_boxes))

        for got, wanted in zip(boxes, expected_boxes):
            self.assertEqual(got, wanted)
Exemplo n.º 28
0
    def test_text(self, run_tesseract, copen, temp_dir):
        """The stripped content of the output file is returned as text."""
        run_tesseract.return_value = (0, "")
        copen.return_value = StringIO(self._get_file_content("text"))
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            # temp_dir's return value acts as a context manager yielding
            # tmpdir.
            fake_context = MagicMock()
            fake_context.__enter__.return_value = tmpdir
            temp_dir.return_value = fake_context
            result = tesseract.image_to_string(self.image,
                                               builder=self.builder)

        self.assertEqual(result, self._get_file_content("text").strip())

        expected_kwargs = {
            "cwd": tmpdir,
            "lang": None,
            "flags": self.builder.tesseract_flags,
            "configs": self.builder.tesseract_configs,
        }
        run_tesseract.assert_called_once_with(
            "input.bmp", "output", **expected_kwargs)
Exemplo n.º 29
0
    def test_write_read(self):
        """Boxes written with the builder must read back unchanged."""
        source_image = Image.open("tests/data/test.png")
        original_boxes = tesseract.image_to_string(source_image, builder=self.builder)
        self.assertTrue(len(original_boxes) > 0)

        (file_descriptor, tmp_path) = tempfile.mkstemp()
        try:
            # Only the path is needed: the file is reopened through
            # codecs.open() for utf-8 support.
            os.close(file_descriptor)

            with codecs.open(tmp_path, "w", encoding="utf-8") as sink:
                self.builder.write_file(sink, original_boxes)

            with codecs.open(tmp_path, "r", encoding="utf-8") as source:
                new_boxes = self.builder.read_file(source)

            self.assertEqual(len(new_boxes), len(original_boxes))
            for restored, original in zip(new_boxes, original_boxes):
                self.assertEqual(restored, original)
        finally:
            os.remove(tmp_path)
Exemplo n.º 30
0
    def test_char(self, run_tesseract, copen, temp_dir):
        """Every item of the result must be a ``builders.Box``."""
        run_tesseract.return_value = (0, "")
        copen.return_value = StringIO(self._get_file_content("boxes"))
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            # temp_dir's return value acts as a context manager yielding
            # tmpdir.
            fake_context = MagicMock()
            fake_context.__enter__.return_value = tmpdir
            temp_dir.return_value = fake_context
            result = tesseract.image_to_string(self.image,
                                               builder=self.builder)

        for item in result:
            self.assertIsInstance(item, builders.Box)

        expected_kwargs = {
            "cwd": tmpdir,
            "lang": None,
            "flags": self.builder.tesseract_flags,
            "configs": self.builder.tesseract_configs,
        }
        run_tesseract.assert_called_once_with(
            "input.bmp", "output", **expected_kwargs)
Exemplo n.º 31
0
    def __test_txt(self, image_file, expected_box_file, lang='eng'):
        """Compare OCR boxes, including their word boxes, with a reference.

        For every pair of boxes the word boxes are first compared by type,
        position by position, and then the boxes are compared as a whole.
        """
        image_path = "tests/data/" + image_file
        reference_path = "tests/tesseract/" + expected_box_file

        boxes = tesseract.image_to_string(
            Image.open(image_path), lang=lang, builder=self.builder)
        boxes.sort()

        with codecs.open(reference_path, 'r', encoding='utf-8') as reference:
            expected_boxes = self.builder.read_file(reference)
        expected_boxes.sort()

        self.assertEqual(len(boxes), len(expected_boxes))

        for actual, expected in zip(boxes, expected_boxes):
            # The expected word boxes are indexed, so a shorter reference
            # still fails loudly.
            for position, word_box in enumerate(actual.word_boxes):
                self.assertEqual(type(word_box),
                                 type(expected.word_boxes[position]))
            self.assertEqual(actual, expected)
Exemplo n.º 32
0
    def test_write_read(self):
        """Writing boxes to a file and reading them back must round-trip."""
        picture = Image.open("tests/data/test.png")
        original_boxes = tesseract.image_to_string(picture,
                                                   builder=self.builder)
        self.assertTrue(len(original_boxes) > 0)

        (file_descriptor, tmp_path) = tempfile.mkstemp()
        try:
            # Only the path is needed: the file is reopened through
            # codecs.open() for utf-8 support.
            os.close(file_descriptor)

            with codecs.open(tmp_path, 'w', encoding='utf-8') as sink:
                self.builder.write_file(sink, original_boxes)

            with codecs.open(tmp_path, 'r', encoding='utf-8') as source:
                new_boxes = self.builder.read_file(source)

            self.assertEqual(len(new_boxes), len(original_boxes))
            for restored, original in zip(new_boxes, original_boxes):
                self.assertEqual(restored, original)
        finally:
            os.remove(tmp_path)
Exemplo n.º 33
0
    def test_text_non_rgb_image(self, run_tesseract, copen, temp_dir):
        """Check that image_to_string accepts an image in a non-RGB mode
        (here mode "L") and still returns the stripped output text."""
        grey_image = self.image.convert("L")
        run_tesseract.return_value = (0, "")
        copen.return_value = StringIO(self._get_file_content("text"))
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            # temp_dir's return value acts as a context manager yielding
            # tmpdir.
            fake_context = MagicMock()
            fake_context.__enter__.return_value = tmpdir
            temp_dir.return_value = fake_context
            result = tesseract.image_to_string(grey_image,
                                               builder=self.builder)

        self.assertEqual(result, self._get_file_content("text").strip())

        expected_kwargs = {
            "cwd": tmpdir,
            "lang": None,
            "flags": self.builder.tesseract_flags,
            "configs": self.builder.tesseract_configs,
        }
        run_tesseract.assert_called_once_with(
            "input.bmp", "output", **expected_kwargs)
Exemplo n.º 34
0
 def get_data(self,
              region,
              rerun=0,
              image_optimizer=None,
              data_optimizer=None,
              threshold=None):
     '''Gets data out of a single region
     :param region: image region handed to tesseract (after optimizing).
     :param rerun: incrementing integer for every rerun.
     :param image_optimizer: callback, takes PIL.Image and rerun as params.
     :param data_optimizer: callback, takes String and rerun as params.
     :param threshold: callback, takes String and should return False for a rerun.
     :return: the (optionally optimized) OCR result that passed threshold.
     '''
     # Without an optimizer the region is used as-is; leaving the name
     # unset would raise UnboundLocalError on the next line.
     if image_optimizer:
         optiregion = image_optimizer(region, rerun)
     else:
         optiregion = region
     data = tesseract.image_to_string(optiregion)
     if data_optimizer:
         data = data_optimizer(data, rerun)
     if threshold and not threshold(data):
         return self.get_data(region, rerun + 1, image_optimizer,
                              data_optimizer, threshold)
     return data
Exemplo n.º 35
0
    def test_digits(self, run_tesseract, copen, temp_dir):
        """Every item of the result must be convertible with ``int``."""
        run_tesseract.return_value = (0, "")
        copen.return_value = StringIO(self._get_file_content("digits"))
        with TemporaryDirectory(prefix="tess_") as tmpdir:
            # temp_dir's return value acts as a context manager yielding
            # tmpdir.
            fake_context = MagicMock()
            fake_context.__enter__.return_value = tmpdir
            temp_dir.return_value = fake_context
            # Create an empty output.txt inside the temporary directory.
            output_path = os.path.join(tmpdir, "output.txt")
            with open(output_path, "w") as fh:
                fh.write("")
            result = tesseract.image_to_string(self.image,
                                               builder=self.builder)

        for item in result:
            self.assertIsInstance(int(item), int)

        expected_kwargs = {
            "cwd": tmpdir,
            "lang": None,
            "flags": self.builder.tesseract_flags,
            "configs": self.builder.tesseract_configs,
        }
        run_tesseract.assert_called_once_with(
            "input.bmp", "output", **expected_kwargs)
Exemplo n.º 36
0
def get_text(img_byte):
    """Recognise the four-character code contained in an image.

    The image is thresholded, black pixels are replaced by the pixel above
    them, and the result is inverted, reduced to mode "1", shrunk and
    cropped to a fixed box before being passed to tesseract with a digit
    builder.

    :param img_byte: raw image bytes.
    :return: the recognised text without spaces when it is exactly four
        characters long, otherwise ``"0"``.
    """
    im = Image.open(BytesIO(img_byte))
    # Remove the background colour.
    im = im.point(lambda i: 255 if i > 180 else 0)

    # Remove interference lines: every pure-black pixel takes the value of
    # the pixel directly above it (row 0 refers to itself).
    width, height = im.size
    pixels = im.load()
    for x in range(width):
        for y in range(height):
            px = pixels[x, y]
            if px[0] == 0 and px[1] == 0 and px[2] == 0:
                pixels[x, y] = pixels[x, max(y - 1, 0)]

    im = ImageOps.invert(im).convert("1")
    old_width, old_height = im.size
    im.thumbnail((old_width*0.7, old_height*0.7))

    column_sums = np.array(im).sum(axis=0)
    print(column_sums)

    # Crop.
    region = im.crop((36, 1, 105, 34))

    # OCR
    builder = tesseract.builders.DigitBuilder()
    digits_address = os.path.join(os.getcwd(), 'config/digits')
    print("digits_address -> ", digits_address)
    builder.tesseract_configs = ['-psm', '7', digits_address]
    result = tesseract.image_to_string(region, 'eng', builder)

    code_text = result.replace(' ', '')
    print("out text -> ", code_text)
    return str(code_text) if len(code_text) == 4 else "0"
Exemplo n.º 37
0
def get_text(img_byte, card_modul):
    """Recognise the card number in an image.

    :param img_byte: raw image bytes.
    :param card_modul: mapping with the keys ``typeId``, ``name`` and
        ``cut`` (the box passed to ``Image.crop``).
    :return: the OCR result with all spaces removed.
    :raises ValueError: if ``typeId`` is not 1, 2 or 3.
    """
    im = Image.open(BytesIO(img_byte))
    print(card_modul['typeId'], card_modul['name'])
    # All supported card types are cropped the same way; anything else is
    # rejected explicitly instead of failing later on an unset variable.
    if card_modul['typeId'] not in (1, 2, 3):
        raise ValueError(
            "unsupported card typeId: %r" % (card_modul['typeId'],))
    # Crop.
    region = im.crop(card_modul["cut"])

    region = ImageOps.invert(region).convert("L")
    # NOTE(review): show() opens an image viewer on every call - looks like
    # a debugging aid; confirm it is wanted.
    region.show()

    # OCR
    builder = tesseract.builders.DigitBuilder()
    digits_address = os.path.join(os.getcwd(), 'config/digits')
    builder.tesseract_configs = ['-psm', '7', digits_address]
    result = tesseract.image_to_string(region, 'eng', builder)
    return result.replace(' ', '')
Exemplo n.º 38
0
def extract(image_file, spellchecker=None):
    """OCR a German-language image and return cleaned text plus e-mails.

    Every word of the OCR text is kept as-is when it is numeric or a single
    character.  Other words are passed, lower-cased, to *spellchecker*; the
    correction (its first entry when a non-string is returned) replaces the
    word, keeping a leading capital.  Words without a correction get a
    trailing ``?``.

    :param image_file: path of the image to open.
    :param spellchecker: optional callable taking a lower-case word and
        returning a correction, a sequence of suggestions, or a falsy value.
    :return: tuple ``(clean_text, emails)`` - the space-joined words and the
        list of e-mail addresses found in the raw OCR text.
    """
    text = pytesseract.image_to_string(Image.open(image_file), lang="deu")
    clean_text = []
    for word in re.findall(r"\w+", text):
        if word.isdigit() or len(word) == 1:
            clean_text.append(word)
            continue
        correction = None
        if spellchecker:
            correction = spellchecker(word.lower())
            if correction and not isinstance(correction, str):
                # some spellcheckers return a list of suggestions -> use
                # first suggestion
                correction = correction[0]
        if correction:
            if word[0].isupper():
                # keep capitalization of first char.
                correction = correction[0].upper() + correction[1:]
            clean_text.append(correction)
        else:
            clean_text.append(word + "?")
    emails = []
    # Raw string: in a normal literal "\b" is a backspace character, not a
    # word boundary, and the pattern would never match.
    for email in re.findall(
            r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b",
            text,
            flags=re.UNICODE | re.IGNORECASE,
    ):
        _log.debug("email: %s", email)
        emails.append(email)
    return " ".join(clean_text), emails
Exemplo n.º 39
0
from pyocr import tesseract
from PIL import Image
# Load 'code.jpg', run it through Tesseract and print the recognised text.
imagefile = Image.open('code.jpg')
val = tesseract.image_to_string(image=imagefile)
print(val)
Exemplo n.º 40
0
 def scanImage(self, img):
     """Return the text Tesseract reads from image file ``img``.

     Returns the literal string "FAIL" when ``tesseract.is_available()``
     reports that Tesseract cannot be used.
     """
     if not tesseract.is_available():
         return "FAIL"
     return tesseract.image_to_string(Image.open(img))
Exemplo n.º 41
0
def getText(filename):
    """Open the image at ``filename`` and return the text Tesseract recognises."""
    return tesseract.image_to_string(Image.open(filename))
Exemplo n.º 42
0
from PIL import Image
from pyocr import tesseract

# OCR every picture in the list and print what Tesseract reads from each.
pic_list = ['pic1.png', 'pic2.png']
for i in pic_list:
    # Open the picture and convert it to a greyscale image
    im = Image.open(i).convert('L')
    # Save the converted picture (overwritten on every iteration)
    im.save("temp.png")
    code = tesseract.image_to_string(im)
    print(code)
import pytesseract as tess
import pytesseract as Output
from PIL import Image
import cv2 as cv
import re
# Optional: explicit location of the Tesseract executable used by pytesseract.
tess.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'
from pyocr.tesseract import image_to_string

# Single definition of the image path shared by both OCR passes below.
IMAGE_PATH = r'C:\Users\Admin\projects\webapp\media\report\images\med1.png'

# First pass: pyocr's Tesseract wrapper on the PIL image.
text = image_to_string(Image.open(IMAGE_PATH), lang='eng')

# Second pass: pytesseract on the array loaded by OpenCV.
# Use the public `cv.imread`; the nested `cv.cv2` submodule is not exposed by
# every opencv-python build and raises AttributeError where it is missing.
img = cv.imread(IMAGE_PATH)
custom_config = r'-c tessedit_char_whitelist=medical --psm 6'
print(tess.pytesseract.image_to_string(img, config=custom_config))
#TESSERACT_CMD = 'tesseract.exe' if os.name == 'nt' else 'tesseract'

#m = Image.open('c:/Users/Admin/projects/webapp/media/report/images/1.png')
#text = pytesseract.image_to_string('m')
import numpy as np
import cv2

from PIL import Image
import sys

import pyocr.tesseract as tess
import pyocr.builders

import time
import os, subprocess

# Timestamp taken just before the OCR call; only the commented-out timing
# print below refers to it.
ts = time.time()
txt = tess.image_to_string(
    Image.open('digit_sample.jpg'),
    lang='eng',
    builder=pyocr.builders.TextBuilder(),
)
#print(txt)
#print(time.time() - ts)
# txt is a Python string


def image_to_string(img, cleanup=True, plus=''):
    """Run the ``tesseract`` command-line tool on an image file and return its text.

    tesseract is invoked with ``img`` as both the input file and the output
    base name, so the recognised text is written to ``img + '.txt'``.

    Args:
        img: Path of the image file to recognise.
        cleanup: When true, delete the generated text file once it has been
            read.
        plus: Extra command-line arguments for tesseract, given as a single
            string; it is split with ``shlex.split`` (POSIX quoting rules).

    Returns:
        The content of the generated text file with surrounding whitespace
        stripped.

    Raises:
        subprocess.CalledProcessError: tesseract exited with a non-zero status.
        OSError: the tesseract executable could not be started, or the
            generated text file could not be read.
    """
    import shlex

    # Blocking call with an argument list: the output file is guaranteed to
    # be complete before it is read, and the file name is never interpreted
    # by a shell (the previous os.popen call did neither).
    subprocess.check_call(['tesseract', img, img] + shlex.split(plus))

    txt_path = img + '.txt'
    try:
        with open(txt_path, 'r') as f:
            text = f.read().strip()
    finally:
        if cleanup and os.path.exists(txt_path):
            os.remove(txt_path)
    return text
Exemplo n.º 45
0
from pyocr import builders
from PIL import Image, ImageEnhance, ImageFilter

def test_text(image_file, lang='eng'):
    """Print ``image_file`` and return the text Tesseract reads from it.

    Args:
        image_file: Path of the image, opened with ``Image.open``.
        lang: Language code forwarded to ``tesseract.image_to_string``.

    Returns:
        Whatever ``tesseract.image_to_string`` produces with a
        ``tesseract.DigitBuilder()`` builder.
    """
    # Single-argument parenthesised print: same output on Python 2, and
    # valid syntax on Python 3 (the bare print statement is not).
    print(image_file)

    return tesseract.image_to_string(
            Image.open(image_file),
            lang=lang,
            builder=tesseract.DigitBuilder())


# The prints below use the single-argument parenthesised form, which behaves
# the same on Python 2 and is valid syntax on Python 3.
print(test_text('./123.png'))

print(Image.open('./123.png'))
print(tesseract.image_to_string(Image.open('./11.jpg'),
                                lang='eng'))

# Pre-process the image: median filter, contrast enhancement by a factor of 2,
# then conversion to mode '1'.
image_name = "./123.png"
im = Image.open(image_name)
im = im.filter(ImageFilter.MedianFilter())
enhancer = ImageEnhance.Contrast(im)
im = enhancer.enhance(2)
im = im.convert('1')
#im.show()

# Geometry of the numbers in the picture, all values in pixels
s = 12          # start position of the first number
w = 10          # width of each number
h = 15          # end position from the top
t = 2           # start position from the top

im_new = []
#split four numbers in the picture
        except ValueError as e:
            continue

        # Use wand to convert each page in the PDF into an image blob
        # Loop over blobs
        for img in image_jpeg.sequence:
            img_page = Image(image=img)
            # append as a blob into the req_image list
            req_image.append(img_page.make_blob('jpeg'))

        # run OCR over the image blobs
        txt = ""
        for img in req_image:
            try:
                txt += tool.image_to_string(
                    PI.open(io.BytesIO(img)),
                    lang=lang,
                    builder=pyocr.builders.TextBuilder())
                error_flag = False
            except OSError as e:
                error_flag = True
                print("#####################")
                print("Error: txt")
                print("#####################")
            final_text.append(txt)

        if error_flag:
            continue

        print("#####################")
        print("TEXT FOUND:")
        print(final_text)