def process(self):
    """
    Performs the (text) recognition.

    Loads each input PAGE file, sets the page image on tesseract,
    recognizes text line by line inside every text region (attaching the
    result as a TextEquiv), and writes the updated PAGE file to the
    output file group.
    """
    print(self.parameter)
    with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=DEFAULT_MODEL) as tessapi:
        # BUG FIX: the format is "Using model %s in %s", but the arguments
        # were passed as (tessdata path, model); swapped to match the message
        # (get_languages() returns (tessdata path, [model names])).
        log.info("Using model %s in %s for recognition",
                 get_languages()[1][-1], get_languages()[0])
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            pcgts = from_file(self.workspace.download_file(input_file))
            # TODO use binarized / gray
            pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
            tessapi.SetImage(pil_image)
            # TODO slow
            # tessapi.SetPageSegMode(PSM.SINGLE_LINE)
            log.info("page %s", pcgts)
            for region in pcgts.get_Page().get_TextRegion():
                textlines = region.get_TextLine()
                log.info("About to recognize text in %i lines of region '%s'",
                         len(textlines), region.id)
                for line in textlines:
                    log.debug("Recognizing text in line '%s'", line.id)
                    # line polygon -> bounding box for tesseract's rectangle API
                    xywh = xywh_from_points(line.get_Coords().points)
                    tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
                    # log.debug("xywh: %s", xywh)
                    line.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
                    # print(tessapi.AllWordConfidences())
            ID = mets_file_id(self.output_file_grp, n)
            self.add_output_file(
                ID=ID,
                file_grp=self.output_file_grp,
                basename=ID + '.xml',
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts).encode('utf-8'),
            )
def is_available(cls):
    """Return True if tesserocr is importable and at least one language
    model is installed, False otherwise (never raises on a missing
    tesserocr installation)."""
    try:
        from tesserocr import get_languages
        # get_languages() -> (tessdata path, [language codes]);
        # the path was an unused local in the original — discard it.
        _, languages = get_languages()
        return len(languages) > 0
    except ImportError:
        return False
def get_languages(self, languages):
    """Map the requested ISO-2 codes in *languages* to installed ISO-3
    tesseract models and return them '+'-joined (always including 'eng')."""
    if not hasattr(self, 'supported_languages'):
        from tesserocr import get_languages
        # get_languages() -> (tessdata path, [language codes])
        _, self.supported_languages = get_languages()
    codes = set(['eng'])
    # BUG FIX: iterate the *requested* languages; the original looped over
    # list_to_alpha3(codes) — i.e. the seed set {'eng'} — so the `languages`
    # argument was silently ignored.
    for lang in list_to_alpha3(languages):
        if lang in self.supported_languages:
            codes.add(lang)
    return '+'.join(sorted(codes))
def lang_available(lang):
    """Call Tesseract OCR to verify language is available."""
    installed = tesserocr.get_languages()[1]
    if len(lang) != 2:
        # already an ISO-3 (tesseract) code
        return lang in installed
    # two-letter codes are resolved through the "OCR Language" doctype
    return frappe.get_doc("OCR Language", { "lang": lang }).code in installed
def language_list(self, languages):
    """Return the installed tesseract models matching *languages* as a
    '+'-joined string, capped at MAX_MODELS and always including 'eng'."""
    if not hasattr(settings, 'ocr_supported'):
        with temp_locale(TESSERACT_LOCALE):
            # Tesseract language types:
            from tesserocr import get_languages
            _, settings.ocr_supported = get_languages()
            # log.info("OCR languages: %r", settings.ocr_supported)
    models = [code for code in alpha3(languages) if code in settings.ocr_supported]
    if len(models) > self.MAX_MODELS:
        log.warning("Too many models, limit: %s", self.MAX_MODELS)
        models = models[:self.MAX_MODELS]
    models.append('eng')
    return '+'.join(sorted(set(models)))
def __init__(self, engine_ocr_lang: str = None, *_, **__):
    """Prepare the tesserocr engine: pick the target language and probe
    the installed tessdata directory and models."""
    logger.info(f'engine {self.get_type()} preparing ...')
    # check language data before execute function, not here.
    self.engine_ocr_lang = engine_ocr_lang or self.DEFAULT_LANGUAGE
    # get_languages() -> (tessdata path, [language codes])
    tess_data_dir, available_langs = tesserocr.get_languages()
    self.engine_ocr_tess_data_dir = tess_data_dir
    self.engine_ocr_available_lang_list = available_langs
    logger.debug(f'target lang: {self.engine_ocr_lang}')
    logger.debug(f'tess data dir: {self.engine_ocr_tess_data_dir}')
    logger.debug(f'available language: {self.engine_ocr_available_lang_list}')
    logger.info(f'engine {self.get_type()} loaded')
def __init__(self, **kwargs):
    """Set up the OCR text predictors and CNN trainer/predictors, then
    pickle the trainer list for later classification runs."""
    # (name, crop box, character whitelist, page-seg-mode) per OCR region
    self.text_predictors_list = [
        ("previous_level", (1212, 231, 1230, 280), "0123456789", "8"),
        ("main_level", (1203, 323, 1223, 399), "0123456789", "8"),
        ("next_level", (1212, 445, 1230, 493), "0123456789", "8"),
        ("sub_level", (1177, 625, 1203, 692), "0123456789/", "8"),
        ("gold", (1091, 283, 1126, 471),
         "0123456789.abcdefghijklmnopqrstuvwxyz", "7"),
        ("current_dps_down_no_tab", (389, 562, 423, 709),
         "0123456789.abcdefghijklmnopqrstuvwxyz", "8"),
        ("last_hero", (124, 109, 148, 430),
         "0123456789.ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", "7")
    ]
    self.api = PyTessBaseAPI()
    self.api.Init()
    print(tesserocr.tesseract_version())
    print(tesserocr.get_languages())
    self.global_image = None
    self.status = CurrentStatus()
    # CNN classifiers: (name, classes, crop box, w, h, scale, layer sizes)
    boss_trainer = TrainerPredictor(
        "boss_active_predictor",
        ["boss_active", "boss_inactive", "no_boss"],
        (1224, 555, 1248, 648), 12, 46, 255.0, [200, 30])
    egg_trainer = TrainerPredictor(
        "egg_active_predictor",
        ["egg_active", "egg_inactive"],
        (741, 31, 761, 64), 10, 16, 255.0, [200, 30])
    gold_pet_trainer = TrainerPredictor(
        "gold_pet_predictor",
        ["goldpet", "nopet", "normalpet", "partial pet"],
        (624, 364, 734, 474), 40, 40, 255.0, [200, 30])
    tab_predictor = TrainerPredictor(
        "tab_predictor",
        ["skills_tab", "heroes_tab", "equipment_tab", "pet_tab",
         "relic_tab", "shop_tab", "no_tab"],
        (51, 1, 59, 717), 2, 179, 255.0, [200, 30])
    self.trainers_predictors_list = [
        boss_trainer, egg_trainer, gold_pet_trainer, tab_predictor]
    # training/preprocessing steps are currently disabled:
    for trainer in self.trainers_predictors_list:
        pass
        #trainer.crop_images()
        #trainer.process_images()
        #trainer.read_and_pickle()
        #trainer.train_graph()
    saved_classes_file = glo.DATA_FOLDER + "/dataforclassifier/TrainerPredictor_list.pickle"
    save_pickle(saved_classes_file, self.trainers_predictors_list)
def __init__(self, config):
    """Read crop/upscale/threshold settings from *config* and locate the
    tessdata directory for the configured language."""
    self.source_file = config["source_file"]
    crop_cfg = config["crop"]
    self.crop_box_dimensions = crop_cfg["crop_box_dimension"]
    self.height_crop_box = crop_cfg["crop_box_height"]
    self.height_crop_box_alt = crop_cfg["crop_box_height_alt"]
    upscale_cfg = config["upscale"]
    self.ss_factor = upscale_cfg["supersampling_factor"]
    self.expand_ratio = upscale_cfg["expand_ratio"]
    self.upscale_mode = upscale_cfg["upscale_mode"]
    threshold_cfg = config["threshold"]
    self.inline_threshold = threshold_cfg["inline_threshold"]
    self.outline_threshold = threshold_cfg["outline_threshold"]
    self.scd_threshold = threshold_cfg["scd_threshold"]
    self.language = config["language"]
    self.sub_count = 0
    # prefer a bundled traineddata file; otherwise fall back to the
    # system tessdata path reported by tesserocr
    if os.path.exists(f'data/tessdata/{self.language}.traineddata'):
        self.tessdata = 'data/tessdata'
    else:
        self.tessdata, _ = tesserocr.get_languages()
def test_detect_os(self):
    """Test DetectOS and DetectOrientationScript (tesseract v4+)."""
    self._api.SetPageSegMode(tesserocr.PSM.OSD_ONLY)
    self._api.SetImageFile(self._image_file)
    orientation = self._api.DetectOS()
    # BUG FIX: the original wrapped these assertions in all(...); since
    # assertIn() returns None (falsy), all() short-circuited after the
    # first key and the remaining keys were never checked.
    for k in ["sconfidence", "oconfidence", "script", "orientation"]:
        self.assertIn(k, orientation)
    self.assertEqual(orientation["orientation"], 0)
    # this is sorted alphabetically!
    languages = tesserocr.get_languages()[1]
    self.assertLess(orientation["script"], len(languages))
    # therefore does not work
    # script_name = languages[orientation["script"]]
    # self.assertEqual(script_name, 'Latin')
    # cannot test: not reliable
    if _TESSERACT_VERSION >= 0x3999800:
        orientation = self._api.DetectOrientationScript()
        # same fix as above: check every key, not just the first
        for k in ["orient_deg", "orient_conf", "script_name", "script_conf"]:
            self.assertIn(k, orientation)
        self.assertEqual(orientation["orient_deg"], 0)
        self.assertEqual(orientation["script_name"], "Latin")
def tesseract_ocr_supported_languages():
    """
    tesseract ocr supported languages
    supported languages of tesseract ocr
    ---
    tags:
      - ocr
    responses:
      200:
        description: a list of supported languages
        schema:
          type: object
          properties:
            code:
              type: integer
            supported_languages:
              type: array
              items:
                type: string
        examples:
          supported_languages: {"code":200,"supported_languages":["eng"]}
    """
    # get_languages() -> (tessdata path, [language codes])
    _, supported = tesserocr.get_languages()
    return {"code": 200, "supported_languages": supported}
# -*-coding:utf8-*-#
__author__ = 'play4fun'
"""
create time:16/10/21 11:47
"""
import tesserocr
from PIL import Image

# report the tesseract-ocr version and the tessdata path / installed languages
print(tesserocr.tesseract_version())
print(tesserocr.get_languages())

image = Image.open('sample.jpg')
# OCR directly from a PIL image ...
print(tesserocr.image_to_text(image))
# ... or directly from the file on disk
print(tesserocr.file_to_text('sample.jpg'))
def __init__(self):
    """Cache the list of language models installed for Tesseract."""
    # get_languages() -> (tessdata path, [language codes]); keep the codes
    self.supported = get_languages()[1]
def get_available_lang() -> list:
    """Return the list of language models installed for Tesseract."""
    # get_languages() -> (tessdata path, [language codes])
    _, langs = tesserocr.get_languages()
    return langs
def __init__(self):
    """Cache the installed Tesseract models and create per-thread storage."""
    # get_languages() -> (tessdata path, [language codes]); keep the codes
    self.supported = get_languages()[1]
    self.tl = threading.local()
def process(self):
    """
    Performs the (text) recognition.

    For each input PAGE file: loads it, sets the page image on tesseract,
    records this processing step in the PAGE metadata, recognizes text in
    all text regions down to the configured textequiv_level, and adds the
    resulting PAGE file to the output file group.
    """
    # print(self.parameter)
    log.debug("TESSDATA: %s, installed tesseract models: %s", *get_languages())
    maxlevel = self.parameter['textequiv_level']
    model = get_languages()[1][-1]  # last installed model
    if 'model' in self.parameter:
        model = self.parameter['model']
        if model not in get_languages()[1]:
            raise Exception("configured model " + model + " is not installed")
    with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
        log.info("Using model '%s' in %s for recognition at the %s level",
                 model, get_languages()[0], maxlevel)
        # todo: populate GetChoiceIterator() with LSTM models, too:
        # tessapi.SetVariable("lstm_choice_mode", "2")
        # todo: determine relevancy of these tesseract variables:
        # tessedit_single_match, tessedit_load_sublangs,
        # tessedit_preserve_min_wd_len, tessedit_prefer_joined_punct,
        # tessedit_write_rep_codes, tessedit_parallelize,
        # tessedit_zero_(kelvin_)rejection, tessedit_reject_mode,
        # tessedit_use_reject_spaces, tessedit_fix_fuzzy_spaces,
        # tessedit_char_blacklist / tessedit_char_whitelist,
        # chs_leading_punct, chs_trailing_punct1/2, numeric_punctuation,
        # unrecognised_char, ok_repeated_ch_non_alphanum_wds,
        # conflict_set_I_l_1, preserve_interword_spaces,
        # tessedit_enable_dict_correction, tessedit_enable_bigram_correction,
        # stopper_smallword_size, wordrec_max_join_chunks,
        # suspect_space_level, suspect_short_words,
        # language_model_ngram_on/order, language_model_min_compound_length,
        # language_model_penalty_* (non_freq_dict_word, non_dict_word, punc,
        # case, script, chartype, spacing), textord_max_noise_size,
        # enable_noise_removal, classify_bln_numeric_mode, lstm_use_matrix,
        # user_words_file, user_patterns_file
        for page_no, input_file in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", page_no, input_file)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            # TODO use binarized / gray
            page_image = self.workspace.resolve_image_as_pil(
                pcgts.get_Page().imageFilename)
            tessapi.SetImage(page_image)
            # record this processing step (Metadata is ensured by from_file())
            step_labels = [LabelType(type_=name, value=self.parameter[name])
                           for name in self.parameter.keys()]
            pcgts.get_Metadata().add_MetadataItem(MetadataItemType(
                type_="processingStep",
                name=OCRD_TOOL['tools']['ocrd-tesserocr-recognize']['steps'][0],
                value='ocrd-tesserocr-recognize',
                Labels=[LabelsType(externalRef="parameters", Label=step_labels)]))
            log.info("Recognizing text in page '%s'", pcgts.get_pcGtsId())
            regions = pcgts.get_Page().get_TextRegion()
            if not regions:
                log.warning("Page contains no text regions")
            self._process_regions(regions, maxlevel, tessapi)
            ID = concat_padded(self.output_file_grp, page_no)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, ID),
                content=to_xml(pcgts),
            )
def process(self):
    """
    Performs the (text) recognition.

    Recognizes text line by line (and, at the 'glyph' level, word by word
    with per-glyph alternative choices) within each text region of each
    input page, then writes the updated PAGE file to the output group.
    """
    print(self.parameter)
    if self.parameter['textequiv_level'] not in ['line', 'glyph']:
        raise Exception("currently only implemented at the line/glyph level")
    model = get_languages()[1][-1]  # last installed model
    if 'model' in self.parameter:
        model = self.parameter['model']
        if model not in get_languages()[1]:
            raise Exception("configured model " + model + " is not installed")
    with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
        log.info("Using model %s in %s for recognition", model, get_languages()[0])
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            pcgts = from_file(self.workspace.download_file(input_file))
            # TODO use binarized / gray
            pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
            tessapi.SetImage(pil_image)
            log.info("page %s", pcgts)
            for region in pcgts.get_Page().get_TextRegion():
                textlines = region.get_TextLine()
                log.info("About to recognize text in %i lines of region '%s'",
                         len(textlines), region.id)
                for line in textlines:
                    log.debug("Recognizing text in line '%s'", line.id)
                    xywh = xywh_from_points(line.get_Coords().points)
                    tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
                    tessapi.SetPageSegMode(PSM.SINGLE_LINE)
                    # log.debug("xywh: %s", xywh)
                    line.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
                    # print(tessapi.AllWordConfidences())
                    if self.parameter['textequiv_level'] == 'glyph':
                        for word in line.get_Word():
                            log.debug("Recognizing text in word '%s'", word.id)
                            xywh = xywh_from_points(word.get_Coords().points)
                            tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
                            tessapi.SetPageSegMode(PSM.SINGLE_WORD)
                            word.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
                            result_it = tessapi.GetIterator()
                            for (result_no, result) in enumerate(iterate_level(result_it, RIL.SYMBOL)):
                                #symb = result.GetUTF8Text(RIL.SYMBOL) # is first choice?
                                #conf = result.Confidence(RIL.SYMBOL) # is first choice?
                                bbox = result.BoundingBox(RIL.SYMBOL)
                                # BUG FIX: identity check — was `bbox == None`
                                if bbox is None:
                                    continue
                                glyph_id = '%s_glyph%04d' % (word.id, result_no)
                                log.debug("Recognizing text in glyph '%s'", glyph_id)
                                glyph = GlyphType(id=glyph_id,
                                                  Coords=CoordsType(points_from_x0y0x1y1(bbox)))
                                word.add_Glyph(glyph)
                                # attach every alternative symbol with its confidence
                                choice_it = result.GetChoiceIterator()
                                for (choice_no, choice) in enumerate(choice_it):
                                    alternative_symb = choice.GetUTF8Text()
                                    alternative_conf = choice.Confidence()
                                    glyph.add_TextEquiv(TextEquivType(index=choice_no,
                                                                      conf=alternative_conf,
                                                                      Unicode=alternative_symb))
            ID = concat_padded(self.output_file_grp, n)
            self.add_output_file(
                ID=ID,
                file_grp=self.output_file_grp,
                basename=ID + '.xml',
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
def lang_available(lang):
    """Call Tesseract OCR to verify language is available."""
    installed = tesserocr.get_languages()[1]
    # map the ISO-2 English code to tesseract's ISO-3 code
    code = "eng" if lang == 'en' else lang
    return code in installed
if not commandtext: return commands = commandtext.split() arglen = len(commands) if arglen == 1 and commands[0] == '/lang': bot.sendMessage(msg['chat']['id'], supported_lang, reply_to_message_id=msg['message_id']) if arglen == 2 and commands[0] == '/ocr': if commands[1] not in languages: sendReply(msg, "Unsupported language specified") return if 'reply_to_message' in msg: processOCR(msg['reply_to_message'], commands[1]) else: sendReply(msg, "Must be used in reply to a picture based message") language = tesserocr.get_languages() for lang in language[1]: if lang != 'osd' and lang != 'equ': languages.append(lang) supported_lang = ','.join(languages) pprint.pprint(bot.getMe()) telepot.loop.MessageLoop(bot, handle).run_as_thread() while 1: time.sleep(10)
from __future__ import absolute_import from tesserocr import PyTessBaseAPI, PSM, get_languages from ocrd.utils import getLogger, mets_file_id, xywh_from_points from ocrd.model.ocrd_page import from_file, to_xml, TextEquivType from ocrd import Processor, MIMETYPE_PAGE from ocrd_tesserocr.config import TESSDATA_PREFIX log = getLogger('processor.TesserocrRecognize') DEFAULT_MODEL = get_languages()[1][-1] class TesserocrRecognize(Processor): def process(self): """ Performs the (text) recognition. """ print(self.parameter) with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=DEFAULT_MODEL) as tessapi: log.info("Using model %s in %s for recognition", get_languages()[0], get_languages()[1][-1]) for (n, input_file) in enumerate(self.input_files): log.info("INPUT FILE %i / %s", n, input_file) pcgts = from_file(self.workspace.download_file(input_file)) # TODO use binarized / gray pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename) tessapi.SetImage(pil_image) # TODO slow # tessapi.SetPageSegMode(PSM.SINGLE_LINE) log.info("page %s", pcgts) for region in pcgts.get_Page().get_TextRegion():
def detect_text_tess(path):
    """OCR the image at *path* with the English model and return its text."""
    # report tesseract version plus tessdata path / installed languages
    print(tesserocr.tesseract_version())
    print(tesserocr.get_languages())
    return tesserocr.file_to_text(path, lang='eng')
import os
import json
from pkg_resources import resource_string

import tesserocr

# Honour an explicit TESSDATA_PREFIX environment variable; otherwise fall
# back to the tessdata path reported by tesserocr
# (get_languages() -> (tessdata path, [language codes])).
if 'TESSDATA_PREFIX' in os.environ:
    TESSDATA_PREFIX = os.environ['TESSDATA_PREFIX']
else:
    TESSDATA_PREFIX = tesserocr.get_languages()[0]

# tool description shipped inside the package
OCRD_TOOL = json.loads(
    resource_string(__name__, 'ocrd-tool.json').decode('utf8'))
import tesserocr
from languagecodes import list_to_alpha3

# Tesseract language types: get_languages() -> (tessdata path, [codes])
_, LANGUAGES = tesserocr.get_languages()


def get_languages(codes):
    """Turn some ISO2 language codes into ISO3 codes."""
    supported = [code for code in list_to_alpha3(codes) if code in LANGUAGES]
    return '+'.join(sorted(supported))
@Date: 2020-05-13 10:58:02 @LastEditors: Rodney Cheung @LastEditTime: 2020-05-15 18:09:00 @FilePath: /Tesser/main.py ''' import tesserocr from tesserocr import PyTessBaseAPI, RIL from PIL import Image, ImageOps import os import argparse TESSDATA_PATH = '/Volumes/code/open_source/tessdata_best/' # TESTDATA_PATH = '/Volumes/code/work/wq_maintain_material/picture/test_data' print(tesserocr.tesseract_version()) print(tesserocr.get_languages(path=TESSDATA_PATH)) def str2bool(v): if isinstance(v, bool): return v if v.lower() in ('yes', 'true', 't', 'y', '1'): return True elif v.lower() in ('no', 'false', 'f', 'n', '0'): return False else: raise argparse.ArgumentTypeError('Boolean value expected.') def main(): parser = argparse.ArgumentParser(description='tesserocr interface')
def getinfo():
    """Print the installed tesseract-ocr version, then the tessdata path
    and the list of available languages."""
    print(tesserocr.tesseract_version())
    print(tesserocr.get_languages())
def __init__(self):
    """Create per-thread storage and cache the installed language models."""
    self.thread = local()
    # get_languages() -> (tessdata path, [language codes]); keep the codes
    self.supported_languages = get_languages()[1]
def __init__(self):
    """Cache the installed Tesseract models and start the English engine."""
    # get_languages() -> (tessdata path, [language codes]); keep the codes
    self.supported = get_languages()[1]
    self.reset_engine('eng')
images = ['sample.jpg', 'sample2.jpg', 'sample3.jpg'] with PyTessBaseAPI() as api: for img in images: api.SetImageFile(img) print api.GetUTF8Text() print api.AllWordConfidences() # api is automatically finalized when used in a with-statement (context manager). # otherwise api.End() should be explicitly called when it's no longer needed. #Basic Usage import tesserocr from PIL import Image print tesserocr.tesseract_version() # print tesseract-ocr version print tesserocr.get_languages( ) # prints tessdata path and list of available languages image = Image.open('sample.jpg') print tesserocr.image_to_text(image) # print ocr text from image # or print tesserocr.file_to_text('sample.jpg') #Advanced API from PIL import Image from tesserocr import PyTessBaseAPI image = Image.open('/usr/src/tesseract/testing/phototest.tif') with PyTessBaseAPI() as api: api.SetImage(image) boxes = api.GetComponentImages(RIL.TEXTLINE, True) print 'Found {} textline image components.'.format(len(boxes))
import tesserocr as tc
from PIL import Image

'''
Support new languages and fonts by adding trained-data files, e.g. under
C:\\Users\\admin\\AppData\\Local\\Programs\\Python\\Python37-32\\/tessdata/'
'''


class OcrTools:
    def __init__(self):
        demo = ''
        # report tesseract-ocr version plus tessdata path / languages
        print(tc.tesseract_version())
        print(tc.get_languages())
        filename = 'data/news.png'
        en_filename = 'data/testp.png'
        image = Image.open(filename)
        # print(tc.image_to_text(image))  # OCR straight from the PIL image
        # standard simplified-Chinese sample image
        print('---------------------标准中文图片---------------------')
        print(tc.file_to_text(filename, lang='chi_sim'))
        # standard English sample image
        print('---------------------标准英文图片---------------------')
        print(tc.file_to_text(en_filename))
#!/usr/bin/env python2 # -*- coding: utf-8 -*- """ Created on Thu Oct 5 05:06:47 2017 @author: ubuntu """ ## sample_db_path = "./sample/" test_db_path = "/media/ubuntu/Investigation/DataSet/Image/Classification/Insurance/Insurance/Tmp/VIN/" filename = "1.jpg" fullpath = test_db_path + filename ### import cv2 import numpy as np import tesserocr from PIL import Image def opencv2pillow(image): return Image.fromarray(image) print tesserocr.tesseract_version() # print tesseract-ocr version print tesserocr.get_languages() image = cv2.imread(fullpath) image = opencv2pillow(image) print tesserocr.image_to_text(image)
def get_data_home() -> str:
    """Return the tessdata directory reported by tesserocr."""
    # get_languages() -> (tessdata path, [language codes])
    tessdata_path, _ = tesserocr.get_languages()
    return tessdata_path