def process(self):
    """Segment each text region of every input page into text lines.

    For each input file, crops every TextRegion polygon out of the page
    image, runs tesseract textline detection on the crop, shifts the
    detected boxes back into page coordinates, appends them as TextLine
    elements, and writes the enriched PAGE-XML to the output file group.
    """
    with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for page_no, input_file in enumerate(self.input_files):
            pcgts = from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            image_url = page.imageFilename
            for region in page.get_TextRegion():
                log.debug("Detecting lines in %s with tesseract", region.id)
                region_points = region.get_Coords().points
                # crop the region polygon from the page image
                region_image = self.workspace.resolve_image_as_pil(
                    image_url, polygon_from_points(region_points))
                tessapi.SetImage(region_image)
                # line boxes come back in region-local coordinates
                shift = xywh_from_points(region_points)
                components = tessapi.GetComponentImages(RIL.TEXTLINE, True)
                for line_no, component in enumerate(components):
                    box = component[1]
                    box['x'] += shift['x']
                    box['y'] += shift['y']
                    region.add_TextLine(TextLineType(
                        id='%s_line%04d' % (region.id, line_no),
                        Coords=CoordsType(points_from_xywh(box))))
            ID = concat_padded(self.output_file_grp, page_no)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                basename=ID + '.xml',
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts).encode('utf-8'),
            )
def process(self):
    """Recognize text in every text line of the input PAGE-XML files.

    Loads the default tesseract model, sets the full page image once per
    input file, restricts recognition to each TextLine's bounding box,
    attaches the result as a TextEquiv, and serializes the updated
    PAGE-XML into the output file group.
    """
    print(self.parameter)
    with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=DEFAULT_MODEL) as tessapi:
        # FIX: the log arguments were swapped (the tessdata path was printed
        # as the model name), and the message reported the last *installed*
        # model instead of the model actually loaded (DEFAULT_MODEL).
        log.info("Using model %s in %s for recognition",
                 DEFAULT_MODEL, get_languages()[0])
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            pcgts = from_file(self.workspace.download_file(input_file))
            # TODO use binarized / gray
            pil_image = self.workspace.resolve_image_as_pil(
                pcgts.get_Page().imageFilename)
            tessapi.SetImage(pil_image)
            # TODO slow
            # tessapi.SetPageSegMode(PSM.SINGLE_LINE)
            log.info("page %s", pcgts)
            for region in pcgts.get_Page().get_TextRegion():
                textlines = region.get_TextLine()
                log.info("About to recognize text in %i lines of region '%s'",
                         len(textlines), region.id)
                for line in textlines:
                    log.debug("Recognizing text in line '%s'", line.id)
                    # restrict recognition to this line's bounding box
                    xywh = xywh_from_points(line.get_Coords().points)
                    tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
                    # log.debug("xywh: %s", xywh)
                    line.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
                    # tessapi.G
                    # print(tessapi.AllWordConfidences())
            ID = mets_file_id(self.output_file_grp, n)
            self.add_output_file(
                ID=ID,
                file_grp=self.output_file_grp,
                basename=ID + '.xml',
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts).encode('utf-8'),
            )
def process(self):
    """Segment each input page into text regions with a reading order.

    Runs tesseract block detection on the full page image; every detected
    block becomes a TextRegion, and a matching RegionRefIndexed entry is
    appended to the page's single ordered reading-order group.  The result
    is saved as PAGE-XML in the output file group.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        print(self.input_file_grp)
        for page_no, input_file in enumerate(self.input_files):
            pcgts = from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            image = self.workspace.resolve_image_as_pil(page.imageFilename)
            log.debug("Detecting regions with tesseract")
            tessapi.SetImage(image)
            for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                points = points_from_xywh(component[1])
                index = component[2]
                # the region id doubles as the reading-order reference
                ID = "region%04d" % index
                log.debug("Detected region '%s': %s", ID, points)
                # ensure a <pg:ReadingOrder> exists
                reading_order = page.get_ReadingOrder()
                if reading_order is None:
                    reading_order = ReadingOrderType()
                    page.set_ReadingOrder(reading_order)
                # ensure a single <pg:OrderedGroup> exists
                group = reading_order.get_OrderedGroup()
                if group is None:
                    group = OrderedGroupType(id="reading-order")
                    reading_order.set_OrderedGroup(group)
                # register the region in the reading order
                group.add_RegionRefIndexed(
                    RegionRefIndexedType(regionRef=ID, index=index))
                # add the text region itself
                page.add_TextRegion(
                    TextRegionType(id=ID, Coords=CoordsType(points=points)))
            ID = concat_padded(self.output_file_grp, page_no)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                basename=ID + '.xml',
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts).encode('utf-8'),
            )
def process(self):
    """Segment every text line of the input pages into words.

    Crops each TextLine polygon out of the page image, runs tesseract
    word detection on the crop (PSM.SINGLE_LINE), translates the word
    boxes back into page coordinates, appends them as Word elements,
    and writes the enriched PAGE-XML to the output file group.
    """
    with PyTessBaseAPI(
            psm=PSM.SINGLE_LINE,
            path=TESSDATA_PREFIX,
    ) as tessapi:
        for page_no, input_file in enumerate(self.input_files):
            pcgts = from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            image_url = page.imageFilename
            for region in page.get_TextRegion():
                for line in region.get_TextLine():
                    log.debug("Detecting words in line '%s'", line.id)
                    line_points = line.get_Coords().points
                    # crop the line polygon from the page image
                    line_image = self.workspace.resolve_image_as_pil(
                        image_url, polygon_from_points(line_points))
                    tessapi.SetImage(line_image)
                    # word boxes come back in line-local coordinates
                    shift = xywh_from_points(line_points)
                    components = tessapi.GetComponentImages(RIL.WORD, True)
                    for word_no, component in enumerate(components):
                        box = component[1]
                        box['x'] += shift['x']
                        box['y'] += shift['y']
                        line.add_Word(WordType(
                            id='%s_word%04d' % (line.id, word_no),
                            Coords=CoordsType(points_from_xywh(box))))
            ID = concat_padded(self.output_file_grp, page_no)
            self.add_output_file(
                ID=ID,
                file_grp=self.output_file_grp,
                basename=ID + '.xml',
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts).encode('utf-8'),
            )
def write_alignment_to_xml(self):
    """Serialize the computed alignments into a new output file group.

    The master file (first index of ``self.ifs``) provides the PAGE-XML
    skeleton; each of its text lines is annotated with the matching line
    and word alignments before the document is written back.
    """
    self.log.info("writing alignment to %s", self.process.output_file_grp)
    master = self.ifs[0]
    pcgts = from_file(self.process.workspace.download_file(master))
    alignments = iter(self.line_alignments)
    for region in pcgts.get_Page().get_TextRegion():
        for line in region.get_TextLine():
            self.log.info("line: %s", line.get_TextEquiv()[0].Unicode)
            # mark the master transcription as the first choice
            line.get_TextEquiv()[0].set_index(0)
            alignment = next(alignments)
            self.add_line_alignments(line, alignment)
            self.add_word_alignments(line, alignment)
    self.log.debug("master basename: %s", master.basename)
    self.process.add_output_file(
        ID="{}_{}".format(master.ID, self.process.output_file_grp),
        mimetype=MIMETYPE_PAGE,
        content=to_xml(pcgts),
        file_grp=self.process.output_file_grp,
        basename=master.basename,
    )
def process(self):
    """Recognize text at the line or (additionally) glyph level.

    Selects a tesseract model (the configured one, else the last
    installed one), recognizes each TextLine inside its bounding box,
    and — at the 'glyph' level — also recognizes each Word and attaches
    per-symbol alternatives (with confidences) as Glyph/TextEquiv
    elements.  Results are written as PAGE-XML to the output file group.

    Raises:
        Exception: if textequiv_level is not 'line' or 'glyph', or the
            configured model is not installed.
    """
    print(self.parameter)
    if self.parameter['textequiv_level'] not in ['line', 'glyph']:
        raise Exception("currently only implemented at the line/glyph level")
    model = get_languages()[1][-1]  # last installed model
    if 'model' in self.parameter:
        model = self.parameter['model']
        if model not in get_languages()[1]:
            raise Exception("configured model " + model + " is not installed")
    with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
        log.info("Using model %s in %s for recognition",
                 model, get_languages()[0])
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            pcgts = from_file(self.workspace.download_file(input_file))
            # TODO use binarized / gray
            pil_image = self.workspace.resolve_image_as_pil(
                pcgts.get_Page().imageFilename)
            tessapi.SetImage(pil_image)
            # TODO slow
            # tessapi.SetPageSegMode(PSM.SINGLE_LINE)
            log.info("page %s", pcgts)
            for region in pcgts.get_Page().get_TextRegion():
                textlines = region.get_TextLine()
                log.info("About to recognize text in %i lines of region '%s'",
                         len(textlines), region.id)
                for line in textlines:
                    log.debug("Recognizing text in line '%s'", line.id)
                    # restrict recognition to this line's bounding box
                    xywh = xywh_from_points(line.get_Coords().points)
                    tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
                    tessapi.SetPageSegMode(PSM.SINGLE_LINE)
                    # log.debug("xywh: %s", xywh)
                    line.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
                    # tessapi.G
                    # print(tessapi.AllWordConfidences())
                    if self.parameter['textequiv_level'] == 'glyph':
                        for word in line.get_Word():
                            log.debug("Recognizing text in word '%s'", word.id)
                            xywh = xywh_from_points(word.get_Coords().points)
                            tessapi.SetRectangle(xywh['x'], xywh['y'], xywh['w'], xywh['h'])
                            tessapi.SetPageSegMode(PSM.SINGLE_WORD)
                            word.add_TextEquiv(TextEquivType(Unicode=tessapi.GetUTF8Text()))
                            # walk the recognition result symbol by symbol
                            result_it = tessapi.GetIterator()
                            for (result_no, result) in enumerate(
                                    iterate_level(result_it, RIL.SYMBOL)):
                                #symb = result.GetUTF8Text(RIL.SYMBOL) # is first choice?
                                #conf = result.Confidence(RIL.SYMBOL) # is first choice?
                                bbox = result.BoundingBox(RIL.SYMBOL)
                                # FIX: identity comparison for None (was `== None`,
                                # which PEP 8 forbids and a custom __eq__ could break)
                                if bbox is None:
                                    continue
                                glyph_id = '%s_glyph%04d' % (word.id, result_no)
                                log.debug("Recognizing text in glyph '%s'", glyph_id)
                                glyph = GlyphType(
                                    id=glyph_id,
                                    Coords=CoordsType(points_from_x0y0x1y1(bbox)))
                                word.add_Glyph(glyph)
                                # record every alternative symbol with its confidence
                                choice_it = result.GetChoiceIterator()
                                for (choice_no, choice) in enumerate(choice_it):
                                    alternative_symb = choice.GetUTF8Text()
                                    alternative_conf = choice.Confidence()
                                    glyph.add_TextEquiv(TextEquivType(
                                        index=choice_no,
                                        conf=alternative_conf,
                                        Unicode=alternative_symb))
            ID = concat_padded(self.output_file_grp, n)
            self.add_output_file(
                ID=ID,
                file_grp=self.output_file_grp,
                basename=ID + '.xml',
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
def process(self):
    """Recognize text at the configured textequiv_level (below page).

    Selects a tesseract model (the configured one, else the last
    installed one), records a processingStep MetadataItem with all
    parameters, then delegates per-page recognition of all text regions
    to ``self._process_regions`` and writes the resulting PAGE-XML to
    the output file group.

    Raises:
        Exception: if the configured model is not installed, or if
            textequiv_level is 'page' (not implemented).
    """
    # print(self.parameter)
    log.debug("TESSDATA: %s, installed tesseract models: %s", *get_languages())
    maxlevel = self.parameter['textequiv_level']
    model = get_languages()[1][-1]  # last installed model
    if 'model' in self.parameter:
        model = self.parameter['model']
        if model not in get_languages()[1]:
            raise Exception("configured model " + model + " is not installed")
    with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
        log.info("Using model '%s' in %s for recognition at the %s level",
                 model, get_languages()[0], maxlevel)
        tessapi.SetVariable("glyph_confidences", "2")  # populate GetChoiceIterator() with LSTM models, too
        # tessapi.SetVariable("tessedit_single_match", "0")
        #
        # other tesseract variables that could be tuned (with defaults):
        # tessedit_load_sublangs
        # tessedit_preserve_min_wd_len 2
        # tessedit_prefer_joined_punct 0
        # tessedit_write_rep_codes 0
        # tessedit_parallelize 0
        # tessedit_zero_rejection 0
        # tessedit_zero_kelvin_rejection 0
        # tessedit_reject_mode 0
        # tessedit_use_reject_spaces 1
        # tessedit_fix_fuzzy_spaces 1
        # tessedit_char_blacklist
        # tessedit_char_whitelist
        # chs_leading_punct ('`"
        # chs_trailing_punct1 ).,;:?!
        # chs_trailing_punct2 )'`"
        # numeric_punctuation .,
        # unrecognised_char |
        # ok_repeated_ch_non_alphanum_wds -?*=
        # conflict_set_I_l_1 Il1[]
        # preserve_interword_spaces 0
        # tessedit_enable_dict_correction 0
        # tessedit_enable_bigram_correction 1
        # stopper_smallword_size 2
        # wordrec_max_join_chunks 4
        # suspect_space_level 100
        # suspect_short_words 2
        # language_model_ngram_on 0
        # language_model_ngram_order 8
        # language_model_min_compound_length 3
        # language_model_penalty_non_freq_dict_word 0.1
        # language_model_penalty_non_dict_word 0.15
        # language_model_penalty_punc 0.2
        # language_model_penalty_case 0.1
        # language_model_penalty_script 0.5
        # language_model_penalty_chartype 0.3
        # language_model_penalty_spacing 0.05
        # textord_max_noise_size 7
        # enable_noise_removal 1
        # classify_bln_numeric_mode 0
        # lstm_use_matrix 1
        # user_words_file
        # user_patterns_file
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            pcgts = from_file(self.workspace.download_file(input_file))
            # TODO use binarized / gray
            pil_image = self.workspace.resolve_image_as_pil(
                pcgts.get_Page().imageFilename)
            tessapi.SetImage(pil_image)
            # annotate this processing step (with its parameters) in the
            # PAGE metadata
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=OCRD_TOOL['tools']['ocrd-tesserocr-recognize']
                    ['steps'][0],
                    value='ocrd-tesserocr-recognize',
                    Labels=[
                        LabelsType(externalRef="parameters",
                                   Label=[
                                       LabelType(
                                           type_=name,
                                           value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))
            # TODO slow
            # tessapi.SetPageSegMode(PSM.SINGLE_LINE)
            log.info("Recognizing text in page '%s'", pcgts.get_pcGtsId())
            page = pcgts.get_Page()
            if maxlevel == 'page':
                # not sure what to do here:
                # - We cannot simply do GetUTF8Text(), because there is no
                #   TextEquiv on the page level.
                # - We could GetComponentImages(RIL.BLOCK) and add a text
                #   region for each, then enter region level recognition
                #   below. But what if regions are already annotated? How to
                #   go about non-text blocks?
                raise Exception(
                    "currently only implemented below the page level")
            ## region, line, word, or glyph level:
            regions = page.get_TextRegion()
            if not regions:
                log.warning("Page contains no text regions")
            self._process_regions(regions, maxlevel, tessapi)
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                basename=ID + '.xml',
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )