Exemplo n.º 1
0
 def _process_words_in_line(self, line, maxlevel, result_it):
     """Add one word annotation to ``line`` per word found by ``result_it``.

     For each word a ``WordType`` with an id of the form
     ``<line.id>_word<NNNN>``, bounding-box coordinates, optional font
     style and a text/confidence ``TextEquivType`` is attached to ``line``.
     Unless ``maxlevel`` equals ``'word'``, glyphs are annotated as well via
     ``_process_glyphs_in_word``.

     ``result_it`` is advanced in place (presumably a tesserocr result
     iterator -- confirm against the caller). Iteration stops at the last
     word of the text line, on an empty/missing iterator, or after
     ``MAX_ELEMENTS`` words, whichever comes first.
     """
     # (TextStyleType keyword, key in the font attribute mapping)
     style_fields = (
         ('fontSize', 'pointsize'),
         ('fontFamily', 'font_name'),
         ('bold', 'bold'),
         ('italic', 'italic'),
         ('underlined', 'underlined'),
         ('monospace', 'monospace'),
         ('serif', 'serif'),
     )
     for word_no in range(MAX_ELEMENTS):
         # Regular exit is the IsAtFinalElement check at the bottom;
         # MAX_ELEMENTS merely caps the number of iterations.
         if not result_it:
             log.error("No iterator at '%s'", line.id)
             break
         if result_it.Empty(RIL.WORD):
             log.debug("No word here")
             break

         word_id = '%s_word%04d' % (line.id, word_no)
         log.debug("Recognizing text in word '%s'", word_id)
         bbox = result_it.BoundingBox(RIL.WORD)
         coords = CoordsType(points_from_x0y0x1y1(bbox))
         word = WordType(id=word_id, Coords=coords)
         line.add_Word(word)

         # todo: determine if font attributes available for word level will work with LSTM models
         font = result_it.WordFontAttributes()
         if font:
             # Attributes missing from the mapping are passed as None.
             style_kwargs = {
                 keyword: (font[key] if key in font else None)
                 for keyword, key in style_fields
             }
             # (or somewhere in custom attribute?)
             word.set_TextStyle(TextStyleType(**style_kwargs))

         # The word text is annotated unconditionally, i.e. even when the
         # requested level is glyph.
         text = result_it.GetUTF8Text(RIL.WORD)
         confidence = result_it.Confidence(RIL.WORD) / 100
         word.add_TextEquiv(TextEquivType(Unicode=text, conf=confidence))

         if maxlevel != 'word':
             self._process_glyphs_in_word(word, result_it)

         if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
             break
         result_it.Next(RIL.WORD)
Exemplo n.º 2
0
    def process(self):
        """
        Segment every input file into text lines with kraken.

        For each input file the page image is segmented (optionally followed
        by script detection), the resulting boxes are stored as text lines
        of one newly added text region, and the page is added to the
        workspace as an XML file in the output file group.
        """
        log = getLogger('processor.KrakenSegment')
        for index, input_file in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", index, input_file)
            local_file = self.workspace.download_file(input_file)
            log.info("downloaded_file %s", local_file)
            pcgts = page_from_file(local_file)
            page = pcgts.get_Page()
            # TODO binarized variant from get_AlternativeImage()
            image_url = page.imageFilename
            log.info("pcgts %s", pcgts)

            image = self.workspace.resolve_image_as_pil(image_url)

            log.info('Segmenting')
            log.info('Params %s', self.parameter)
            params = self.parameter
            segmentation = segment(image,
                                   params['text_direction'],
                                   params['scale'],
                                   params['maxcolseps'],
                                   params['black_colseps'])
            if params['script_detect']:
                segmentation = detect_scripts(image, segmentation)

            # All detected lines are collected in a single region.
            region = TextRegionType()
            pcgts.get_Page().add_TextRegion(region)
            for line_index, line_box in enumerate(segmentation['boxes']):
                line_coords = CoordsType(
                    points=points_from_x0y0x1y1(line_box))
                region.add_TextLine(
                    TextLineType(id=concat_padded("line", line_index),
                                 Coords=line_coords))

            file_id = concat_padded(self.output_file_grp, index)
            target_path = "%s/%s.xml" % (self.output_file_grp, file_id)
            self.workspace.add_file(
                self.output_file_grp,
                pageId=input_file.pageId,
                ID=file_id,
                mimetype=MIMETYPE_PAGE,
                local_filename=target_path,
                content=to_xml(pcgts).encode('utf-8'))
Exemplo n.º 3
0
 def _process_glyphs_in_word(self, word, result_it):
     """Add one glyph annotation to ``word`` per symbol found by ``result_it``.

     Each glyph gets an id of the form ``<word.id>_glyph<NNNN>``,
     bounding-box coordinates and one ``TextEquivType`` per accepted
     recognition alternative. Alternatives are taken in iteration order
     until one falls more than ``CHOICE_THRESHOLD_CONF`` below the glyph
     confidence or its index exceeds ``CHOICE_THRESHOLD_NUM``.

     ``result_it`` is advanced in place. Iteration stops at the last symbol
     of the word, on an empty/missing iterator, or after ``MAX_ELEMENTS``
     glyphs, whichever comes first.
     """
     for glyph_no in range(MAX_ELEMENTS):
         # Regular exit is the IsAtFinalElement check at the bottom;
         # MAX_ELEMENTS merely caps the number of iterations.
         if not result_it:
             log.error("No iterator at '%s'", word.id)
             break
         if result_it.Empty(RIL.SYMBOL):
             log.debug("No glyph here")
             break

         glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
         log.debug("Recognizing text in glyph '%s'", glyph_id)
         # equals first choice?
         best_conf = result_it.Confidence(RIL.SYMBOL) / 100
         bbox = result_it.BoundingBox(RIL.SYMBOL)
         coords = CoordsType(points_from_x0y0x1y1(bbox))
         glyph = GlyphType(id=glyph_id, Coords=coords)
         word.add_Glyph(glyph)

         for rank, candidate in enumerate(result_it.GetChoiceIterator()):
             candidate_text = candidate.GetUTF8Text()
             candidate_conf = candidate.Confidence() / 100
             too_unlikely = best_conf - candidate_conf > CHOICE_THRESHOLD_CONF
             if too_unlikely or rank > CHOICE_THRESHOLD_NUM:
                 # This and all following alternatives are dropped.
                 break
             # todo: consider SymbolIsSuperscript (TextStyle), SymbolIsDropcap (RelationType) etc
             equiv = TextEquivType(index=rank,
                                   Unicode=candidate_text,
                                   conf=candidate_conf)
             glyph.add_TextEquiv(equiv)

         if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
             break
         result_it.Next(RIL.SYMBOL)
Exemplo n.º 4
0
 def test_points_from_x0y0x1y1(self):
     """A box given as [x0, y0, x1, y1] becomes a four-corner points string."""
     box = [100, 100, 200, 200]
     expected = '100,100 200,100 200,200 100,200'
     self.assertEqual(points_from_x0y0x1y1(box), expected)
Exemplo n.º 5
0
    def process_lines(self, textlines, predfiles, fgrp, regionid):
        """Annotate text lines, and optionally words and glyphs, from JSON predictions.

        For every line whose prediction name ``<fgrp>-<regionid>-<line.id>``
        occurs in ``predfiles``, the file ``<self.root>/<name>.json`` is read.
        The first text equivalent of the line is replaced by the predicted
        sentence. If ``self.maxlevel`` is ``'word'`` or ``'glyph'``, the
        words of the line are rebuilt from the per-character positions; with
        ``'glyph'`` each word additionally receives one glyph per character.

        Confidences are the midpoint between the lowest and highest value of
        the next finer level (characters for a word, words for the line) and
        are stored as strings.

        Args:
            textlines: iterable of line objects to annotate.
            predfiles: iterable of prediction names (without ``.json``).
            fgrp: first component of the prediction name (presumably the
                file group -- confirm against the caller).
            regionid: second component of the prediction name.
        """
        for line in textlines:
            expected_name = '-'.join([fgrp, regionid, line.id])
            for pred_name in predfiles:
                if pred_name != expected_name:
                    continue
                self.log.info("Processing text in line '%s'", line.id)

                filepath = self.root + '/' + pred_name + '.json'
                # JSON is read as UTF-8 explicitly instead of relying on the
                # locale; the file is closed before any further processing.
                with open(filepath, encoding='utf-8') as f:
                    prediction = json.load(f)['predictions'][0]

                linepred = prediction['sentence']

                # One entry per word: (text, char confidences, char spans).
                parsed_words = []
                text = ''
                confs = []
                spans = []
                for position in prediction['positions']:
                    best = position['chars'][0]
                    char = best['char']
                    char_conf = best['probability']
                    char_span = (position['globalStart'],
                                 position['globalEnd'])
                    if char != ' ':
                        text += char
                        confs.append(char_conf)
                        spans.append(char_span)
                        continue
                    # A space ends the current word. Leading or repeated
                    # spaces would yield an empty word, which has neither
                    # confidences nor positions, so it is skipped.
                    if confs:
                        parsed_words.append((text, confs, spans))
                    text = ''
                    confs = []
                    spans = []
                if confs:
                    # Last word of a line that does not end with a space.
                    parsed_words.append((text, confs, spans))

                if not parsed_words:
                    # Without any character no confidence can be computed.
                    self.log.warning(
                        "No characters predicted for line '%s'", line.id)
                    continue

                wconfs = [(min(word_confs) + max(word_confs)) / 2
                          for _, word_confs, _ in parsed_words]
                lineconf = (min(wconfs) + max(wconfs)) / 2

                line.replace_TextEquiv_at(
                    0, TextEquivType(Unicode=linepred, conf=str(lineconf)))

                # The former test ``self.maxlevel == 'word' or 'glyph'`` was
                # always true because the non-empty string 'glyph' is truthy.
                if self.maxlevel not in ('word', 'glyph'):
                    continue

                box = bounding_box(line.get_Coords().points)
                line.Word = []  # discard words from any earlier annotation
                for w_no, (w, word_confs, word_spans) in enumerate(
                        parsed_words):
                    # Horizontal extent: start of first to end of last char.
                    word_start = word_spans[0][0]
                    word_end = word_spans[-1][-1]
                    # NOTE(review): the right edge adds the end offset to
                    # box[2] (kept as in the original); box[0] may have been
                    # intended -- confirm the units of globalStart/globalEnd.
                    word_bbox = [
                        box[0] + word_start, box[1],
                        box[2] + word_end, box[3]
                    ]

                    word_id = '%s_word%04d' % (line.id, w_no)
                    word = WordType(
                        id=word_id,
                        Coords=CoordsType(points_from_x0y0x1y1(word_bbox)))

                    line.add_Word(word)
                    word.add_TextEquiv(
                        TextEquivType(Unicode=w, conf=str(wconfs[w_no])))

                    if self.maxlevel != 'glyph':
                        continue
                    for glyph_no, g in enumerate(w):
                        glyph_start = word_spans[glyph_no][0]
                        glyph_end = word_spans[glyph_no][-1]
                        # NOTE(review): same right-edge formula as for words.
                        glyph_bbox = [
                            box[0] + glyph_start, box[1],
                            box[2] + glyph_end, box[3]
                        ]

                        glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
                        glyph = GlyphType(
                            id=glyph_id,
                            Coords=CoordsType(
                                points_from_x0y0x1y1(glyph_bbox)))

                        word.add_Glyph(glyph)
                        glyph.add_TextEquiv(
                            TextEquivType(Unicode=g,
                                          conf=str(word_confs[glyph_no])))