Пример #1
0
    def process(self):
        """Classify the type group of every input page and annotate it.

        For each input file, the PAGE document and its page image are loaded
        and passed to the classifier configured by the ``network`` and
        ``stride`` parameters. Scores of the classes not listed in
        ``ignore_type`` are normalised by the sum of their positive parts.

        If an ignored ("noise") class scores higher than every other class,
        the page's primary script is unset. Otherwise the type groups with a
        rounded percentage above zero are written, best first, as a
        comma-separated ``name:percent`` list to the page's
        ``TextStyle/@fontFamily``, and the PAGE document is added to the
        output file group.
        """
        network_file = self.parameter['network']
        stride = self.parameter['stride']
        classifier = TypegroupsClassifier.load(network_file)

        ignore_type = ('Adornment', 'Book covers and other irrelevant data',
                       'Empty Pages', 'Woodcuts - Engravings')

        self.log.debug('Processing: %s', self.input_files)
        for input_file in self.input_files:
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            pil_image = self.workspace.resolve_image_as_pil(page.imageFilename)
            result = classifier.run(pil_image, stride)

            # Normalisation constant: only positive scores of script classes.
            score_sum = 0
            for typegroup in classifier.classMap.cl2id:
                if typegroup not in ignore_type:
                    score_sum += max(0, result[typegroup])

            script_highscore = 0
            noise_highscore = 0
            # (normalised score, type group) pairs. A list rather than a dict
            # keyed by score, so type groups with equal scores are all kept.
            scored = []
            for typegroup in classifier.classMap.cl2id:
                score = result[typegroup]
                if typegroup in ignore_type:
                    noise_highscore = max(noise_highscore, score)
                    continue
                script_highscore = max(script_highscore, score)
                # No positive script score at all: avoid dividing by zero.
                if score_sum > 0:
                    normalised_score = max(0, score / score_sum)
                else:
                    normalised_score = 0
                scored.append((normalised_score, typegroup))

            if noise_highscore > script_highscore:
                page.set_primaryScript(None)
                self.log.debug(
                    'Detected only noise (such as empty page or book cover). noise_highscore=%s > script_highscore=%s',
                    noise_highscore, script_highscore)
                # NOTE(review): as before, no output file is added for noise
                # pages -- confirm that downstream steps expect this.
                continue

            # Best score first; the sort is stable, so ties keep class order.
            scored.sort(key=lambda pair: pair[0], reverse=True)
            entries = []
            for normalised_score, typegroup in scored:
                percent = round(100 * normalised_score)
                if percent <= 0:
                    continue
                entries.append('%s:%d' % (typegroup, percent))
            output = ', '.join(entries)
            self.log.debug('Detected %s', output)

            textStyle = page.get_TextStyle()
            if not textStyle:
                textStyle = TextStyleType()
                page.set_TextStyle(textStyle)
            textStyle.set_fontFamily(output)
            ID = concat_padded(self.output_file_grp, input_file.ID)
            self.workspace.add_file(ID=ID,
                                    file_grp=self.output_file_grp,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename="%s/%s" %
                                    (self.output_file_grp, ID),
                                    content=to_xml(pcgts))
Пример #2
0
 def _process_words_in_line(self, result_it, line, line_xywh):
     """Annotate the words of one recognised text line.

     Steps through ``result_it`` at word level. For every word a ``WordType``
     is built with an ID derived from ``line.id``, coordinates converted via
     ``line_xywh`` (minus the ``padding`` parameter), an optional text style
     taken from the iterator's font attributes, and a text equivalent with
     the recognised text and its confidence divided by 100. The word is
     attached to ``line`` only if ``polygon_for_parent`` yields a polygon.
     Unless the ``textequiv_level`` parameter is ``'word'``, glyph processing
     is delegated to ``_process_glyphs_in_word``.
     """
     LOG = getLogger('processor.TesserocrRecognize')
     if not result_it or result_it.Empty(RIL.WORD):
         LOG.warning("No text in line '%s'", line.id)
         return
     # TextStyleType keyword -> key in the iterator's font attributes
     style_keys = (('fontSize', 'pointsize'),
                   ('fontFamily', 'font_name'),
                   ('bold', 'bold'),
                   ('italic', 'italic'),
                   ('underlined', 'underlined'),
                   ('monospace', 'monospace'),
                   ('serif', 'serif'))
     word_no = 0
     # runs until IsAtFinalElement(RIL.TEXTLINE, RIL.WORD) ends the loop
     while result_it and not result_it.Empty(RIL.WORD):
         word_id = '%s_word%04d' % (line.id, word_no)
         LOG.debug("Decoding text in word '%s'", word_id)
         bbox = result_it.BoundingBox(RIL.WORD)
         # convert to absolute coordinates:
         polygon = coordinates_for_segment(
             polygon_from_x0y0x1y1(bbox), None,
             line_xywh) - self.parameter['padding']
         fitted = polygon_for_parent(polygon, line)
         fits_parent = fitted is not None
         if fits_parent:
             polygon = fitted
         points = points_from_polygon(polygon)
         word = WordType(id=word_id, Coords=CoordsType(points))
         if fits_parent:
             line.add_Word(word)
         else:
             # could happen due to rotation
             LOG.info('Ignoring extant word: %s', points)
         # todo: determine if font attributes available for word level will work with LSTM models
         font_info = result_it.WordFontAttributes()
         if font_info:
             style_kwargs = {
                 keyword: (font_info[key] if key in font_info else None)
                 for keyword, key in style_keys}
             # (or somewhere in custom attribute?)
             word.set_TextStyle(TextStyleType(**style_kwargs))
         # the word annotation is added unconditionally (i.e. even for glyph level):
         text = result_it.GetUTF8Text(RIL.WORD)
         confidence = result_it.Confidence(RIL.WORD)/100
         word.add_TextEquiv(TextEquivType(Unicode=text, conf=confidence))
         if self.parameter['textequiv_level'] != 'word':
             self._process_glyphs_in_word(result_it, word, line_xywh)
         if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
             break
         word_no += 1
         result_it.Next(RIL.WORD)
Пример #3
0
 def _process_words_in_line(self, line, maxlevel, result_it):
     """Add the words of one recognised text line to ``line``.

     Visits at most ``MAX_ELEMENTS`` words of ``result_it``. Every word
     becomes a ``WordType`` with an ID derived from ``line.id``, coordinates
     from the word's bounding box, an optional text style taken from the
     iterator's font attributes, and a text equivalent with the recognised
     text and its confidence divided by 100. Unless ``maxlevel`` is
     ``'word'``, glyph processing is delegated to ``_process_glyphs_in_word``.
     """
     # TextStyleType keyword -> key in the iterator's font attributes
     style_keys = (('fontSize', 'pointsize'),
                   ('fontFamily', 'font_name'),
                   ('bold', 'bold'),
                   ('italic', 'italic'),
                   ('underlined', 'underlined'),
                   ('monospace', 'monospace'),
                   ('serif', 'serif'))
     # bounded loop; normally left via IsAtFinalElement(RIL.TEXTLINE, RIL.WORD)
     for word_no in range(MAX_ELEMENTS):
         if not result_it:
             log.error("No iterator at '%s'", line.id)
             break
         if result_it.Empty(RIL.WORD):
             log.debug("No word here")
             break
         word_id = '%s_word%04d' % (line.id, word_no)
         log.debug("Recognizing text in word '%s'", word_id)
         bbox = result_it.BoundingBox(RIL.WORD)
         coords = CoordsType(points_from_x0y0x1y1(bbox))
         word = WordType(id=word_id, Coords=coords)
         line.add_Word(word)
         # todo: determine if font attributes available for word level will work with LSTM models
         font_info = result_it.WordFontAttributes()
         if font_info:
             style_kwargs = {
                 keyword: (font_info[key] if key in font_info else None)
                 for keyword, key in style_keys}
             # (or somewhere in custom attribute?)
             word.set_TextStyle(TextStyleType(**style_kwargs))
         # the word annotation is added unconditionally (i.e. even for glyph level):
         text = result_it.GetUTF8Text(RIL.WORD)
         confidence = result_it.Confidence(RIL.WORD) / 100
         word.add_TextEquiv(TextEquivType(Unicode=text, conf=confidence))
         if maxlevel != 'word':
             self._process_glyphs_in_word(word, result_it)
         if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
             break
         result_it.Next(RIL.WORD)
Пример #4
0
 def _process_words(self, tessapi, words, line_image, line_coords):
     """Run single-word recognition on each word and annotate its font style.

     For every word, its image is cropped from ``line_image`` (padded when
     the ``padding`` parameter is set), handed to ``tessapi`` in
     ``PSM.SINGLE_WORD`` mode and recognised. Words for which the iterator
     yields no text are skipped with a warning. The detected text is only
     logged (together with any existing annotation); if the iterator reports
     font attributes, they are stored on the word as a ``TextStyleType``.
     """
     LOG = getLogger('processor.TesserocrFontShape')
     # TextStyleType keyword -> key in the iterator's font attributes
     style_keys = (('fontSize', 'pointsize'),
                   ('fontFamily', 'font_name'),
                   ('bold', 'bold'),
                   ('italic', 'italic'),
                   ('underlined', 'underlined'),
                   ('monospace', 'monospace'),
                   ('serif', 'serif'))
     for word in words:
         word_image, _ = self.workspace.image_from_segment(
             word, line_image, line_coords)
         padding = self.parameter['padding']
         if padding:
             word_image = pad_image(word_image, padding)
         tessapi.SetImage(word_image)
         tessapi.SetPageSegMode(PSM.SINGLE_WORD)
         tessapi.Recognize()
         result_it = tessapi.GetIterator()
         if not result_it or result_it.Empty(RIL.WORD):
             LOG.warning("No text in word '%s'", word.id)
             continue
         LOG.debug("Decoding text in word '%s'", word.id)
         # trigger recognition
         detected = result_it.GetUTF8Text(RIL.WORD)
         LOG.debug('Word "%s" detected "%s"', word.id, detected)
         annotated = word.get_TextEquiv()
         if annotated:
             LOG.info('Word "%s" annotated "%s" / detected "%s"', word.id,
                      annotated[0].Unicode, detected)
         font_info = result_it.WordFontAttributes()
         if not font_info:
             continue
         style_kwargs = {
             keyword: (font_info[key] if key in font_info else None)
             for keyword, key in style_keys}
         # (or somewhere in custom attribute?)
         word.set_TextStyle(TextStyleType(**style_kwargs))
Пример #5
0
def test_styles_from_textstyle():
    """Print what ``TextStylesManager.from_textstyle`` returns for a sample style.

    The manager is created with ``alto_version='4'``; the sample
    ``TextStyleType`` sets a font family, the serif flag and an RGB colour.
    The result is printed only; nothing is asserted.
    """
    manager = TextStylesManager(alto_version='4')
    sample_style = TextStyleType(
        fontFamily='Times New Roman',
        serif=True,
        textColourRgb=6559300,
    )
    converted = manager.from_textstyle(sample_style)
    print(converted)
Пример #6
0
    def process(self):
        """Classify historic script in pages and annotate as font style.

        Open and deserialize PAGE input files and their respective images.
        Then for each page, retrieve the raw image and feed it to the font
        classifier.

        Post-process detections by filtering classes and thresholding scores:
        scores of the classes not listed in ``ignore_type`` are normalised by
        the sum of their positive parts, and only type groups whose rounded
        percentage is above zero are kept. Annotate the good predictions by
        name and score as a comma-separated list (best first) under
        ``/PcGts/Page/TextStyle/@fontFamily``, if any. If an ignored
        ("noise") class scores highest, unset the page's primary script
        instead.

        Produce a new PAGE output file by serialising the resulting hierarchy.
        """
        log = getLogger('ocrd_typegroups_classifier')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        if 'network' not in self.parameter:
            # fall back to the model file bundled with this package
            self.parameter['network'] = resource_filename(
                __name__, 'models/densenet121.tgc')
        network_file = self.resolve_resource(self.parameter['network'])
        stride = self.parameter['stride']
        classifier = TypegroupsClassifier.load(network_file)

        ignore_type = ('Adornment', 'Book covers and other irrelevant data',
                       'Empty Pages', 'Woodcuts - Engravings')

        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info('Processing: %d / %s', n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            pil_image, _, image_info = self.workspace.image_from_page(
                # prefer raw image (to meet expectation of the models, which
                # have been trained on RGB images with both geometry and color
                # transform random augmentation)
                # maybe even: dewarped,deskewed ?
                page,
                page_id,
                feature_filter=
                'binarized,normalized,grayscale_normalized,despeckled')
            # todo: use image_info.resolution
            result = classifier.run(pil_image, stride)

            # Normalisation constant: only positive scores of script classes.
            score_sum = 0
            for typegroup in classifier.classMap.cl2id:
                if typegroup not in ignore_type:
                    score_sum += max(0, result[typegroup])

            script_highscore = 0
            noise_highscore = 0
            # (normalised score, type group) pairs. A list rather than a dict
            # keyed by score, so type groups with equal scores are all kept.
            scored = []
            for typegroup in classifier.classMap.cl2id:
                score = result[typegroup]
                if typegroup in ignore_type:
                    noise_highscore = max(noise_highscore, score)
                    continue
                script_highscore = max(script_highscore, score)
                # No positive script score at all: avoid dividing by zero.
                if score_sum > 0:
                    normalised_score = max(0, score / score_sum)
                else:
                    normalised_score = 0
                scored.append((normalised_score, typegroup))

            if noise_highscore > script_highscore:
                page.set_primaryScript(None)
                log.warning(
                    'Detected only noise on page %s. noise_highscore=%s > script_highscore=%s',
                    page_id, noise_highscore, script_highscore)
                # NOTE(review): as before, no output file is added for noise
                # pages -- confirm that downstream steps expect this.
                continue

            # Best score first; the sort is stable, so ties keep class order.
            scored.sort(key=lambda pair: pair[0], reverse=True)
            entries = []
            for normalised_score, typegroup in scored:
                percent = round(100 * normalised_score)
                if percent <= 0:
                    continue
                entries.append('%s:%d' % (typegroup, percent))
            output = ', '.join(entries)
            log.debug('Detected %s', output)

            textStyle = page.get_TextStyle()
            if not textStyle:
                textStyle = TextStyleType()
                page.set_TextStyle(textStyle)
            textStyle.set_fontFamily(output)
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename="%s/%s.xml" %
                                    (self.output_file_grp, file_id),
                                    content=to_xml(pcgts))