def process(self):
    """
    Performs the (text) recognition.

    Loads each input PAGE file, sets the page image on tesseract,
    recognizes text line by line inside every text region (attaching the
    result as a TextEquiv), and writes the updated PAGE file to the
    output file group.
    """
    print(self.parameter)
    with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=DEFAULT_MODEL) as tessapi:
        # BUG FIX: the format is "Using model %s in %s", but the arguments
        # were passed as (tessdata path, model); swapped to match the message
        # (get_languages() returns (tessdata path, [model names])).
        log.info("Using model %s in %s for recognition",
                 get_languages()[1][-1], get_languages()[0])
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            pcgts = from_file(self.workspace.download_file(input_file))
            # TODO use binarized / gray
            pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
            tessapi.SetImage(pil_image)
            # TODO slow
            # tessapi.SetPageSegMode(PSM.SINGLE_LINE)
            log.info("page %s", pcgts)
            for region in pcgts.get_Page().get_TextRegion():
                textlines = region.get_TextLine()
                log.info("About to recognize text in %i lines of region '%s'",
                         len(textlines), region.id)
                for line in textlines:
                    log.debug("Recognizing text in line '%s'", line.id)
                    # line polygon -> bounding box for tesseract's rectangle API
                    xywh = xywh_from_points(line.get_Coords().points)
                    tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
                    # log.debug("xywh: %s", xywh)
                    line.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
                    # print(tessapi.AllWordConfidences())
            ID = mets_file_id(self.output_file_grp, n)
            self.add_output_file(
                ID=ID,
                file_grp=self.output_file_grp,
                basename=ID + '.xml',
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts).encode('utf-8'),
            )
def is_available(cls):
    """Return True if tesserocr is importable and at least one language
    model is installed, False otherwise (never raises on a missing
    tesserocr installation)."""
    try:
        from tesserocr import get_languages
        # get_languages() -> (tessdata path, [language codes]);
        # the path was an unused local in the original — discard it.
        _, languages = get_languages()
        return len(languages) > 0
    except ImportError:
        return False
def get_languages(self, languages):
    """Map the requested ISO-2 codes in *languages* to installed ISO-3
    tesseract models and return them '+'-joined (always including 'eng')."""
    if not hasattr(self, 'supported_languages'):
        from tesserocr import get_languages
        # get_languages() -> (tessdata path, [language codes])
        _, self.supported_languages = get_languages()
    codes = set(['eng'])
    # BUG FIX: iterate the *requested* languages; the original looped over
    # list_to_alpha3(codes) — i.e. the seed set {'eng'} — so the `languages`
    # argument was silently ignored.
    for lang in list_to_alpha3(languages):
        if lang in self.supported_languages:
            codes.add(lang)
    return '+'.join(sorted(codes))
def lang_available(lang):
    """Call Tesseract OCR to verify language is available."""
    installed = tesserocr.get_languages()[1]
    if len(lang) != 2:
        # already an ISO-3 (tesseract) code
        return lang in installed
    # two-letter codes are resolved through the "OCR Language" doctype
    return frappe.get_doc("OCR Language", { "lang": lang }).code in installed
def language_list(self, languages):
    """Return the installed tesseract models matching *languages* as a
    '+'-joined string, capped at MAX_MODELS and always including 'eng'."""
    if not hasattr(settings, 'ocr_supported'):
        with temp_locale(TESSERACT_LOCALE):
            # Tesseract language types:
            from tesserocr import get_languages
            _, settings.ocr_supported = get_languages()
            # log.info("OCR languages: %r", settings.ocr_supported)
    models = [code for code in alpha3(languages) if code in settings.ocr_supported]
    if len(models) > self.MAX_MODELS:
        log.warning("Too many models, limit: %s", self.MAX_MODELS)
        models = models[:self.MAX_MODELS]
    models.append('eng')
    return '+'.join(sorted(set(models)))
def __init__(self, engine_ocr_lang: str = None, *_, **__):
    """Prepare the tesserocr engine: pick the target language and probe
    the installed tessdata directory and models."""
    logger.info(f'engine {self.get_type()} preparing ...')
    # check language data before execute function, not here.
    self.engine_ocr_lang = engine_ocr_lang or self.DEFAULT_LANGUAGE
    # get_languages() -> (tessdata path, [language codes])
    tess_data_dir, available_langs = tesserocr.get_languages()
    self.engine_ocr_tess_data_dir = tess_data_dir
    self.engine_ocr_available_lang_list = available_langs
    logger.debug(f'target lang: {self.engine_ocr_lang}')
    logger.debug(f'tess data dir: {self.engine_ocr_tess_data_dir}')
    logger.debug(f'available language: {self.engine_ocr_available_lang_list}')
    logger.info(f'engine {self.get_type()} loaded')
def __init__(self, **kwargs):
    """Set up the OCR text predictors and CNN trainer/predictors, then
    pickle the trainer list for later classification runs."""
    # (name, crop box, character whitelist, page-seg-mode) per OCR region
    self.text_predictors_list = [
        ("previous_level", (1212, 231, 1230, 280), "0123456789", "8"),
        ("main_level", (1203, 323, 1223, 399), "0123456789", "8"),
        ("next_level", (1212, 445, 1230, 493), "0123456789", "8"),
        ("sub_level", (1177, 625, 1203, 692), "0123456789/", "8"),
        ("gold", (1091, 283, 1126, 471),
         "0123456789.abcdefghijklmnopqrstuvwxyz", "7"),
        ("current_dps_down_no_tab", (389, 562, 423, 709),
         "0123456789.abcdefghijklmnopqrstuvwxyz", "8"),
        ("last_hero", (124, 109, 148, 430),
         "0123456789.ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", "7")
    ]
    self.api = PyTessBaseAPI()
    self.api.Init()
    print(tesserocr.tesseract_version())
    print(tesserocr.get_languages())
    self.global_image = None
    self.status = CurrentStatus()
    # CNN classifiers: (name, classes, crop box, w, h, scale, layer sizes)
    boss_trainer = TrainerPredictor(
        "boss_active_predictor",
        ["boss_active", "boss_inactive", "no_boss"],
        (1224, 555, 1248, 648), 12, 46, 255.0, [200, 30])
    egg_trainer = TrainerPredictor(
        "egg_active_predictor",
        ["egg_active", "egg_inactive"],
        (741, 31, 761, 64), 10, 16, 255.0, [200, 30])
    gold_pet_trainer = TrainerPredictor(
        "gold_pet_predictor",
        ["goldpet", "nopet", "normalpet", "partial pet"],
        (624, 364, 734, 474), 40, 40, 255.0, [200, 30])
    tab_predictor = TrainerPredictor(
        "tab_predictor",
        ["skills_tab", "heroes_tab", "equipment_tab", "pet_tab",
         "relic_tab", "shop_tab", "no_tab"],
        (51, 1, 59, 717), 2, 179, 255.0, [200, 30])
    self.trainers_predictors_list = [
        boss_trainer, egg_trainer, gold_pet_trainer, tab_predictor]
    # training/preprocessing steps are currently disabled:
    for trainer in self.trainers_predictors_list:
        pass
        #trainer.crop_images()
        #trainer.process_images()
        #trainer.read_and_pickle()
        #trainer.train_graph()
    saved_classes_file = glo.DATA_FOLDER + "/dataforclassifier/TrainerPredictor_list.pickle"
    save_pickle(saved_classes_file, self.trainers_predictors_list)
def __init__(self, config):
    """Read crop/upscale/threshold settings from *config* and locate the
    tessdata directory for the configured language."""
    self.source_file = config["source_file"]
    crop_cfg = config["crop"]
    self.crop_box_dimensions = crop_cfg["crop_box_dimension"]
    self.height_crop_box = crop_cfg["crop_box_height"]
    self.height_crop_box_alt = crop_cfg["crop_box_height_alt"]
    upscale_cfg = config["upscale"]
    self.ss_factor = upscale_cfg["supersampling_factor"]
    self.expand_ratio = upscale_cfg["expand_ratio"]
    self.upscale_mode = upscale_cfg["upscale_mode"]
    threshold_cfg = config["threshold"]
    self.inline_threshold = threshold_cfg["inline_threshold"]
    self.outline_threshold = threshold_cfg["outline_threshold"]
    self.scd_threshold = threshold_cfg["scd_threshold"]
    self.language = config["language"]
    self.sub_count = 0
    # prefer a bundled traineddata file; otherwise fall back to the
    # system tessdata path reported by tesserocr
    if os.path.exists(f'data/tessdata/{self.language}.traineddata'):
        self.tessdata = 'data/tessdata'
    else:
        self.tessdata, _ = tesserocr.get_languages()
def test_detect_os(self):
    """Test DetectOS and DetectOrientationScript (tesseract v4+)."""
    self._api.SetPageSegMode(tesserocr.PSM.OSD_ONLY)
    self._api.SetImageFile(self._image_file)
    orientation = self._api.DetectOS()
    # BUG FIX: the original wrapped these assertions in all(...); since
    # assertIn() returns None (falsy), all() short-circuited after the
    # first key and the remaining keys were never checked.
    for k in ["sconfidence", "oconfidence", "script", "orientation"]:
        self.assertIn(k, orientation)
    self.assertEqual(orientation["orientation"], 0)
    # this is sorted alphabetically!
    languages = tesserocr.get_languages()[1]
    self.assertLess(orientation["script"], len(languages))
    # therefore does not work
    # script_name = languages[orientation["script"]]
    # self.assertEqual(script_name, 'Latin')
    # cannot test: not reliable
    if _TESSERACT_VERSION >= 0x3999800:
        orientation = self._api.DetectOrientationScript()
        # same fix as above: check every key, not just the first
        for k in ["orient_deg", "orient_conf", "script_name", "script_conf"]:
            self.assertIn(k, orientation)
        self.assertEqual(orientation["orient_deg"], 0)
        self.assertEqual(orientation["script_name"], "Latin")
def tesseract_ocr_supported_languages():
    """
    tesseract ocr supported languages
    supported languages of tesseract ocr
    ---
    tags:
      - ocr
    responses:
      200:
        description: a list of supported languages
        schema:
          type: object
          properties:
            code:
              type: integer
            supported_languages:
              type: array
              items:
                type: string
        examples:
          supported_languages: {"code":200,"supported_languages":["eng"]}
    """
    # get_languages() -> (tessdata path, [language codes])
    _, supported = tesserocr.get_languages()
    return {"code": 200, "supported_languages": supported}
# -*-coding:utf8-*-#
__author__ = 'play4fun'
"""
create time:16/10/21 11:47
"""
import tesserocr
from PIL import Image

# report the tesseract-ocr version and the tessdata path / installed languages
print(tesserocr.tesseract_version())
print(tesserocr.get_languages())

image = Image.open('sample.jpg')
# OCR directly from a PIL image ...
print(tesserocr.image_to_text(image))
# ... or directly from the file on disk
print(tesserocr.file_to_text('sample.jpg'))
def __init__(self):
    """Cache the list of language models installed for Tesseract."""
    # get_languages() -> (tessdata path, [language codes]); keep the codes
    self.supported = get_languages()[1]
def get_available_lang() -> list:
    """Return the list of language models installed for Tesseract."""
    # get_languages() -> (tessdata path, [language codes])
    _, langs = tesserocr.get_languages()
    return langs
def __init__(self):
    """Cache the installed Tesseract models and create per-thread storage."""
    # get_languages() -> (tessdata path, [language codes]); keep the codes
    self.supported = get_languages()[1]
    self.tl = threading.local()
def process(self):
    """
    Performs the (text) recognition.

    For each input PAGE file: loads it, sets the page image on tesseract,
    records this processing step in the PAGE metadata, recognizes text in
    all text regions down to the configured textequiv_level, and adds the
    resulting PAGE file to the output file group.
    """
    # print(self.parameter)
    log.debug("TESSDATA: %s, installed tesseract models: %s", *get_languages())
    maxlevel = self.parameter['textequiv_level']
    model = get_languages()[1][-1]  # last installed model
    if 'model' in self.parameter:
        model = self.parameter['model']
        if model not in get_languages()[1]:
            raise Exception("configured model " + model + " is not installed")
    with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
        log.info("Using model '%s' in %s for recognition at the %s level",
                 model, get_languages()[0], maxlevel)
        # todo: populate GetChoiceIterator() with LSTM models, too:
        # tessapi.SetVariable("lstm_choice_mode", "2")
        # todo: determine relevancy of these tesseract variables:
        # tessedit_single_match, tessedit_load_sublangs,
        # tessedit_preserve_min_wd_len, tessedit_prefer_joined_punct,
        # tessedit_write_rep_codes, tessedit_parallelize,
        # tessedit_zero_(kelvin_)rejection, tessedit_reject_mode,
        # tessedit_use_reject_spaces, tessedit_fix_fuzzy_spaces,
        # tessedit_char_blacklist / tessedit_char_whitelist,
        # chs_leading_punct, chs_trailing_punct1/2, numeric_punctuation,
        # unrecognised_char, ok_repeated_ch_non_alphanum_wds,
        # conflict_set_I_l_1, preserve_interword_spaces,
        # tessedit_enable_dict_correction, tessedit_enable_bigram_correction,
        # stopper_smallword_size, wordrec_max_join_chunks,
        # suspect_space_level, suspect_short_words,
        # language_model_ngram_on/order, language_model_min_compound_length,
        # language_model_penalty_* (non_freq_dict_word, non_dict_word, punc,
        # case, script, chartype, spacing), textord_max_noise_size,
        # enable_noise_removal, classify_bln_numeric_mode, lstm_use_matrix,
        # user_words_file, user_patterns_file
        for page_no, input_file in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", page_no, input_file)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            # TODO use binarized / gray
            page_image = self.workspace.resolve_image_as_pil(
                pcgts.get_Page().imageFilename)
            tessapi.SetImage(page_image)
            # record this processing step (Metadata is ensured by from_file())
            step_labels = [LabelType(type_=name, value=self.parameter[name])
                           for name in self.parameter.keys()]
            pcgts.get_Metadata().add_MetadataItem(MetadataItemType(
                type_="processingStep",
                name=OCRD_TOOL['tools']['ocrd-tesserocr-recognize']['steps'][0],
                value='ocrd-tesserocr-recognize',
                Labels=[LabelsType(externalRef="parameters", Label=step_labels)]))
            log.info("Recognizing text in page '%s'", pcgts.get_pcGtsId())
            regions = pcgts.get_Page().get_TextRegion()
            if not regions:
                log.warning("Page contains no text regions")
            self._process_regions(regions, maxlevel, tessapi)
            ID = concat_padded(self.output_file_grp, page_no)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                content=to_xml(pcgts),
            )
def process(self):
    """
    Performs the (text) recognition.

    Recognizes text line by line (and, at the 'glyph' level, word by word
    with per-glyph alternative choices) within each text region of each
    input page, then writes the updated PAGE file to the output group.
    """
    print(self.parameter)
    if self.parameter['textequiv_level'] not in ['line', 'glyph']:
        raise Exception("currently only implemented at the line/glyph level")
    model = get_languages()[1][-1]  # last installed model
    if 'model' in self.parameter:
        model = self.parameter['model']
        if model not in get_languages()[1]:
            raise Exception("configured model " + model + " is not installed")
    with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
        log.info("Using model %s in %s for recognition", model, get_languages()[0])
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            pcgts = from_file(self.workspace.download_file(input_file))
            # TODO use binarized / gray
            pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
            tessapi.SetImage(pil_image)
            log.info("page %s", pcgts)
            for region in pcgts.get_Page().get_TextRegion():
                textlines = region.get_TextLine()
                log.info("About to recognize text in %i lines of region '%s'",
                         len(textlines), region.id)
                for line in textlines:
                    log.debug("Recognizing text in line '%s'", line.id)
                    xywh = xywh_from_points(line.get_Coords().points)
                    tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
                    tessapi.SetPageSegMode(PSM.SINGLE_LINE)
                    # log.debug("xywh: %s", xywh)
                    line.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
                    # print(tessapi.AllWordConfidences())
                    if self.parameter['textequiv_level'] == 'glyph':
                        for word in line.get_Word():
                            log.debug("Recognizing text in word '%s'", word.id)
                            xywh = xywh_from_points(word.get_Coords().points)
                            tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
                            tessapi.SetPageSegMode(PSM.SINGLE_WORD)
                            word.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
                            result_it = tessapi.GetIterator()
                            for (result_no, result) in enumerate(iterate_level(result_it, RIL.SYMBOL)):
                                #symb = result.GetUTF8Text(RIL.SYMBOL) # is first choice?
                                #conf = result.Confidence(RIL.SYMBOL) # is first choice?
                                bbox = result.BoundingBox(RIL.SYMBOL)
                                # BUG FIX: identity check — was `bbox == None`
                                if bbox is None:
                                    continue
                                glyph_id = '%s_glyph%04d' % (word.id, result_no)
                                log.debug("Recognizing text in glyph '%s'", glyph_id)
                                glyph = GlyphType(id=glyph_id,
                                                  Coords=CoordsType(points_from_x0y0x1y1(bbox)))
                                word.add_Glyph(glyph)
                                # attach every alternative symbol with its confidence
                                choice_it = result.GetChoiceIterator()
                                for (choice_no, choice) in enumerate(choice_it):
                                    alternative_symb = choice.GetUTF8Text()
                                    alternative_conf = choice.Confidence()
                                    glyph.add_TextEquiv(TextEquivType(index=choice_no,
                                                                      conf=alternative_conf,
                                                                      Unicode=alternative_symb))
            ID = concat_padded(self.output_file_grp, n)
            self.add_output_file(
                ID=ID,
                file_grp=self.output_file_grp,
                basename=ID + '.xml',
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
def lang_available(lang):
    """Call Tesseract OCR to verify language is available."""
    installed = tesserocr.get_languages()[1]
    # map the ISO-2 English code to tesseract's ISO-3 code
    code = "eng" if lang == 'en' else lang
    return code in installed
if not commandtext: return commands = commandtext.split() arglen = len(commands) if arglen == 1 and commands[0] == '/lang': bot.sendMessage(msg['chat']['id'], supported_lang, reply_to_message_id=msg['message_id']) if arglen == 2 and commands[0] == '/ocr': if commands[1] not in languages: sendReply(msg, "Unsupported language specified") return if 'reply_to_message' in msg: processOCR(msg['reply_to_message'], commands[1]) else: sendReply(msg, "Must be used in reply to a picture based message") language = tesserocr.get_languages() for lang in language[1]: if lang != 'osd' and lang != 'equ': languages.append(lang) supported_lang = ','.join(languages) pprint.pprint(bot.getMe()) telepot.loop.MessageLoop(bot, handle).run_as_thread() while 1: time.sleep(10)
from __future__ import absolute_import from tesserocr import PyTessBaseAPI, PSM, get_languages from ocrd.utils import getLogger, mets_file_id, xywh_from_points from ocrd.model.ocrd_page import from_file, to_xml, TextEquivType from ocrd import Processor, MIMETYPE_PAGE from ocrd_tesserocr.config import TESSDATA_PREFIX log = getLogger('processor.TesserocrRecognize') DEFAULT_MODEL = get_languages()[1][-1] class TesserocrRecognize(Processor): def process(self): """ Performs the (text) recognition. """ print(self.parameter) with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=DEFAULT_MODEL) as tessapi: log.info("Using model %s in %s for recognition", get_languages()[0], get_languages()[1][-1]) for (n, input_file) in enumerate(self.input_files): log.info("INPUT FILE %i / %s", n, input_file) pcgts = from_file(self.workspace.download_file(input_file)) # TODO use binarized / gray pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename) tessapi.SetImage(pil_image) # TODO slow # tessapi.SetPageSegMode(PSM.SINGLE_LINE) log.info("page %s", pcgts) for region in pcgts.get_Page().get_TextRegion():
def detect_text_tess(path):
    """OCR the image at *path* with the English model and return its text."""
    # report tesseract version plus tessdata path / installed languages
    print(tesserocr.tesseract_version())
    print(tesserocr.get_languages())
    return tesserocr.file_to_text(path, lang='eng')
import os
import json
from pkg_resources import resource_string

import tesserocr

# Honour an explicit TESSDATA_PREFIX environment variable; otherwise fall
# back to the tessdata path reported by tesserocr
# (get_languages() -> (tessdata path, [language codes])).
if 'TESSDATA_PREFIX' in os.environ:
    TESSDATA_PREFIX = os.environ['TESSDATA_PREFIX']
else:
    TESSDATA_PREFIX = tesserocr.get_languages()[0]

# tool description shipped inside the package
OCRD_TOOL = json.loads(
    resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
import tesserocr
from languagecodes import list_to_alpha3

# Tesseract language types: get_languages() -> (tessdata path, [codes])
_, LANGUAGES = tesserocr.get_languages()


def get_languages(codes):
    """Turn some ISO2 language codes into ISO3 codes."""
    supported = [code for code in list_to_alpha3(codes) if code in LANGUAGES]
    return '+'.join(sorted(supported))
@Date: 2020-05-13 10:58:02 @LastEditors: Rodney Cheung @LastEditTime: 2020-05-15 18:09:00 @FilePath: /Tesser/main.py ''' import tesserocr from tesserocr import PyTessBaseAPI, RIL from PIL import Image, ImageOps import os import argparse TESSDATA_PATH = '/Volumes/code/open_source/tessdata_best/' # TESTDATA_PATH = '/Volumes/code/work/wq_maintain_material/picture/test_data' print(tesserocr.tesseract_version()) print(tesserocr.get_languages(path=TESSDATA_PATH)) def str2bool(v): if isinstance(v, bool): return v if v.lower() in ('yes', 'true', 't', 'y', '1'): return True elif v.lower() in ('no', 'false', 'f', 'n', '0'): return False else: raise argparse.ArgumentTypeError('Boolean value expected.') def main(): parser = argparse.ArgumentParser(description='tesserocr interface')
def getinfo():
    """Print the installed tesseract-ocr version, then the tessdata path
    and the list of available languages."""
    print(tesserocr.tesseract_version())
    print(tesserocr.get_languages())
def __init__(self):
    """Create per-thread storage and cache the installed language models."""
    self.thread = local()
    # get_languages() -> (tessdata path, [language codes]); keep the codes
    self.supported_languages = get_languages()[1]
def __init__(self):
    """Cache the installed Tesseract models and start the English engine."""
    # get_languages() -> (tessdata path, [language codes]); keep the codes
    self.supported = get_languages()[1]
    self.reset_engine('eng')
images = ['sample.jpg', 'sample2.jpg', 'sample3.jpg'] with PyTessBaseAPI() as api: for img in images: api.SetImageFile(img) print api.GetUTF8Text() print api.AllWordConfidences() # api is automatically finalized when used in a with-statement (context manager). # otherwise api.End() should be explicitly called when it's no longer needed. #Basic Usage import tesserocr from PIL import Image print tesserocr.tesseract_version() # print tesseract-ocr version print tesserocr.get_languages( ) # prints tessdata path and list of available languages image = Image.open('sample.jpg') print tesserocr.image_to_text(image) # print ocr text from image # or print tesserocr.file_to_text('sample.jpg') #Advanced API from PIL import Image from tesserocr import PyTessBaseAPI image = Image.open('/usr/src/tesseract/testing/phototest.tif') with PyTessBaseAPI() as api: api.SetImage(image) boxes = api.GetComponentImages(RIL.TEXTLINE, True) print 'Found {} textline image components.'.format(len(boxes))
import tesserocr as tc
from PIL import Image

'''
Support new languages and fonts by adding trained-data files, e.g. under
C:\\Users\\admin\\AppData\\Local\\Programs\\Python\\Python37-32\\/tessdata/'
'''


class OcrTools:
    def __init__(self):
        demo = ''
        # report tesseract-ocr version plus tessdata path / languages
        print(tc.tesseract_version())
        print(tc.get_languages())
        filename = 'data/news.png'
        en_filename = 'data/testp.png'
        image = Image.open(filename)
        # print(tc.image_to_text(image))  # OCR straight from the PIL image
        # standard simplified-Chinese sample image
        print('---------------------标准中文图片---------------------')
        print(tc.file_to_text(filename, lang='chi_sim'))
        # standard English sample image
        print('---------------------标准英文图片---------------------')
        print(tc.file_to_text(en_filename))
#!/usr/bin/env python2 # -*- coding: utf-8 -*- """ Created on Thu Oct 5 05:06:47 2017 @author: ubuntu """ ## sample_db_path = "./sample/" test_db_path = "/media/ubuntu/Investigation/DataSet/Image/Classification/Insurance/Insurance/Tmp/VIN/" filename = "1.jpg" fullpath = test_db_path + filename ### import cv2 import numpy as np import tesserocr from PIL import Image def opencv2pillow(image): return Image.fromarray(image) print tesserocr.tesseract_version() # print tesseract-ocr version print tesserocr.get_languages() image = cv2.imread(fullpath) image = opencv2pillow(image) print tesserocr.image_to_text(image)
def get_data_home() -> str:
    """Return the tessdata directory reported by tesserocr."""
    # get_languages() -> (tessdata path, [language codes])
    tessdata_path, _ = tesserocr.get_languages()
    return tessdata_path