def test_word_segmentation(workspace):
    """Recognize at word level and check the resulting word segmentation.

    Runs ``CalamariRecognize`` on the ``OCR-D-GT-SEG-LINE`` file group with
    ``textequiv_level`` set to ``"word"``, then inspects the first output page:

    * a ``TextLine`` whose text contains "December" must exist,
    * that line must hold at least two ``Word`` elements whose texts, joined
      by single spaces, equal the line text,
    * the page must not contain any ``Glyph`` elements.

    :param workspace: workspace fixture the processor reads from and writes to.
    """
    CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter={
            "checkpoint": CHECKPOINT,
            "textequiv_level": "word",  # Note that we're going down to word level here
        },
    ).process()
    workspace.save_mets()

    page1 = os.path.join(workspace.directory, "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml")
    assert os.path.exists(page1)
    tree = etree.parse(page1)

    # The result should contain a TextLine that contains the text "December".
    # Assert on the result list rather than on the element: truth-testing an
    # element (``etree`` is presumably lxml.etree) checks for children, not for
    # existence, and indexing an empty result would raise IndexError instead
    # of producing a readable assertion failure.
    matching_lines = tree.xpath(
        ".//pc:TextLine[pc:TextEquiv/pc:Unicode[contains(text(),'December')]]",
        namespaces=NSMAP,
    )
    assert matching_lines, "no TextLine containing 'December' found"
    line = matching_lines[0]

    # The textline should a. contain multiple words and b. these should
    # concatenate fine to produce the same line text
    words = line.xpath(".//pc:Word", namespaces=NSMAP)
    assert len(words) >= 2

    word_texts = []
    for word in words:
        unicode_nodes = word.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)
        assert unicode_nodes, "Word without TextEquiv/Unicode"
        word_texts.append(unicode_nodes[0].text)
    words_text = " ".join(word_texts)
    line_text = line.xpath("pc:TextEquiv/pc:Unicode", namespaces=NSMAP)[0].text
    assert words_text == line_text

    # For extra measure, check that we're not seeing any glyphs, as we asked
    # for textequiv_level == "word"
    glyphs = tree.xpath("//pc:Glyph", namespaces=NSMAP)
    assert not glyphs
def test_recognize(workspace):
    """Run recognition on word/glyph-segmented input and check the page text.

    Processes the ``OCR-D-GT-SEG-WORD-GLYPH`` file group using the models in
    ``CHECKPOINT_DIR``, saves the METS, and verifies that the first output
    page exists and contains the word "verſchuldeten".

    :param workspace: workspace fixture the processor reads from and writes to.
    """
    # NOTE(review): another ``test_recognize`` is defined further down in this
    # source; if both live in the same module the later one shadows this one.
    processor = CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter={"checkpoint_dir": CHECKPOINT_DIR},
    )
    processor.process()
    workspace.save_mets()

    first_page = os.path.join(
        workspace.directory,
        "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml",
    )
    assert os.path.exists(first_page)
    assertFileContains(first_page, "verſchuldeten")
def test_recognize_should_warn_if_given_rgb_image_and_single_channel_model(
        workspace, caplog):
    """Expect many "Using raw image" warnings when processing this input.

    Captures log records at WARNING level and above while the processor runs
    on ``OCR-D-GT-SEG-WORD-GLYPH``, then requires more than ten captured
    messages containing "Using raw image".

    :param workspace: workspace fixture the processor reads from and writes to.
    :param caplog: pytest log-capturing fixture.
    """
    caplog.set_level(logging.WARNING)

    processor = CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-WORD-GLYPH",
        output_file_grp="OCR-D-OCR-CALAMARI-BROKEN",
        parameter={'checkpoint_dir': CHECKPOINT_DIR},
    )
    processor.process()

    # The third field of each captured record tuple is the log message.
    captured_messages = (record[2] for record in caplog.record_tuples)
    raw_image_warnings = [
        message for message in captured_messages
        if "Using raw image" in message
    ]
    assert len(raw_image_warnings) > 10  # For every line!
def test_recognize(workspace):
    """Run recognition on line-segmented input and check the page text.

    Processes the ``OCR-D-GT-SEG-LINE`` file group with the ``CHECKPOINT``
    model, saves the METS, and verifies that the first output page exists and
    that its UTF-8 content contains the word "verſchuldeten".

    :param workspace: workspace fixture the processor reads from and writes to.
    """
    processor = CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter={"checkpoint": CHECKPOINT},
    )
    processor.process()
    workspace.save_mets()

    first_page = os.path.join(
        workspace.directory,
        "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml",
    )
    assert os.path.exists(first_page)

    with open(first_page, "r", encoding="utf-8") as page_file:
        page_content = page_file.read()
    assert "verſchuldeten" in page_content
def test_glyphs(workspace):
    """Recognize at glyph level and check that many glyphs are produced.

    Runs ``CalamariRecognize`` on the ``OCR-D-GT-SEG-LINE`` file group with
    ``textequiv_level`` set to ``"glyph"``, then parses the first output page
    and requires at least 100 ``Glyph`` elements in it.

    :param workspace: workspace fixture the processor reads from and writes to.
    """
    recognizer_parameters = {
        "checkpoint": CHECKPOINT,
        # Request output all the way down to glyph level.
        "textequiv_level": "glyph",
    }
    processor = CalamariRecognize(
        workspace,
        input_file_grp="OCR-D-GT-SEG-LINE",
        output_file_grp="OCR-D-OCR-CALAMARI",
        parameter=recognizer_parameters,
    )
    processor.process()
    workspace.save_mets()

    first_page = os.path.join(
        workspace.directory,
        "OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml",
    )
    assert os.path.exists(first_page)

    # A glyph-level run is expected to yield a large number of Glyph elements.
    page_tree = etree.parse(first_page)
    glyph_elements = page_tree.xpath("//pc:Glyph", namespaces=NSMAP)
    assert len(glyph_elements) >= 100