Exemplo n.º 1
0
def test_reading_order():
    """Check the emitted TextBlocks for three ``region_order`` settings.

    The same PAGE file is converted three times: without ``region_order``,
    with ``'reading-order'`` and with ``'reading-order-only'``. Each time the
    number of TextBlocks below PrintSpace and the ID of the first TextBlock
    are compared against the expected values.
    """
    page_file = 'tests/data/FILE_0010_OCR-D-OCR-CALAMARI.xml'
    blocks_xpath = '//alto:PrintSpace/alto:TextBlock'
    first_block_xpath = '//alto:TextBlock[1]'

    # No region_order passed (the original test labels this 'document').
    alto = OcrdPageAltoConverter(page_filename=page_file).convert()
    root = ET.fromstring(str(alto).encode('utf-8'))
    blocks = root.xpath(blocks_xpath, namespaces=NAMESPACES)
    assert len(blocks) == 3
    first_block = root.xpath(first_block_xpath, namespaces=NAMESPACES)[0]
    assert first_block.get('ID') == 'region_0001'
    first_string = root.xpath(
        '//alto:TextBlock[1]/alto:TextLine/alto:String',
        namespaces=NAMESPACES)[0]
    assert first_string.get('CONTENT') == 'wird'

    # region_order='reading-order': still three blocks, different first one.
    alto = OcrdPageAltoConverter(
        region_order='reading-order', page_filename=page_file).convert()
    root = ET.fromstring(str(alto).encode('utf-8'))
    blocks = root.xpath(blocks_xpath, namespaces=NAMESPACES)
    assert len(blocks) == 3
    first_block = root.xpath(first_block_xpath, namespaces=NAMESPACES)[0]
    assert first_block.get('ID') == 'region_0003'

    # region_order='reading-order-only': only two blocks are expected.
    alto = OcrdPageAltoConverter(
        region_order='reading-order-only', page_filename=page_file).convert()
    root = ET.fromstring(str(alto).encode('utf-8'))
    blocks = root.xpath(blocks_xpath, namespaces=NAMESPACES)
    assert len(blocks) == 2
    first_block = root.xpath(first_block_xpath, namespaces=NAMESPACES)[0]
    assert first_block.get('ID') == 'region_0003'
Exemplo n.º 2
0
def test_convert_processingstep():
    """Check that the first softwareName under Processing is 'ocrd-olena-binarize'."""
    converter = OcrdPageAltoConverter(
        page_filename='tests/data/OCR-D-OCR-TESS_00001.xml')
    alto = converter.convert()
    root = ET.fromstring(str(alto).encode('utf-8'))
    software_names = root.xpath(
        '//alto:Processing/alto:processingSoftware/alto:softwareName',
        namespaces=NAMESPACES)
    assert software_names[0].text == 'ocrd-olena-binarize'
Exemplo n.º 3
0
def test_pararaphstyle():
    """Check ParagraphStyle alignment and its reference from the first TextBlock."""
    converter = OcrdPageAltoConverter(
        page_filename='tests/data/align.page.xml')
    alto = converter.convert()
    root = ET.fromstring(str(alto).encode('utf-8'))

    first_style = root.xpath('//alto:ParagraphStyle',
                             namespaces=NAMESPACES)[0]
    assert first_style.get('ALIGN') == 'Block'

    # The first TextBlock must reference the paragraph style in STYLEREFS.
    first_block = root.xpath('//alto:TextBlock', namespaces=NAMESPACES)[0]
    style_refs = first_block.get('STYLEREFS')
    assert 'parastyle-Block---None---None---None---None' in style_refs
Exemplo n.º 4
0
def test_create_alto():
    """Check the second line of a freshly constructed converter's string form.

    ``convert()`` is deliberately not called here; only the serialised root
    ``<alto>`` start tag (namespaces, schema location, version) is compared.
    """
    page_file = (
        'tests/assets/kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0017_PAGE.xml'
    )
    expected_root_tag = '<alto xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.loc.gov/standards/alto/ns-v4#" xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v4# http://www.loc.gov/standards/alto/v4/alto-4-2.xsd" SCHEMAVERSION="4.2">'
    converter = OcrdPageAltoConverter(page_filename=page_file)
    serialized_lines = str(converter).split('\n')
    assert serialized_lines[1] == expected_root_tag
Exemplo n.º 5
0
def test_convert_language():
    """Check the LANG attribute of the elements with IDs r1, r1-l1 and r1-l1-w1."""
    alto = OcrdPageAltoConverter(
        page_filename='tests/data/language.page.xml').convert()
    root = ET.fromstring(str(alto).encode('utf-8'))

    # (element ID, expected LANG value), checked in this order.
    expected_languages = (
        ('r1', 'vol'),
        ('r1-l1', 'nob'),
        ('r1-l1-w1', 'epo'),
    )
    for element_id, language in expected_languages:
        lang_values = root.xpath('//*[@ID="%s"]/@LANG' % element_id,
                                 namespaces=NAMESPACES)
        assert lang_values[0] == language
Exemplo n.º 6
0
def test_layouttag():
    """Check which LayoutTag labels are emitted and how often they are used."""
    converter = OcrdPageAltoConverter(
        page_filename='tests/data/layouttag.page.xml')
    alto = converter.convert()
    serialized = str(alto)
    print(serialized)
    root = ET.fromstring(str(alto).encode('utf-8'))

    layout_tags = root.xpath('//alto:Tags/alto:LayoutTag',
                             namespaces=NAMESPACES)
    labels = [tag.get('LABEL') for tag in layout_tags]
    assert labels == ['paragraph']

    paragraph_nodes = root.xpath('//*[@LABEL="paragraph"]')
    assert len(paragraph_nodes) == 1

    # @TYPE only allowed for BlockType
    catch_word_nodes = root.xpath('//*[@LABEL="catch-word"]')
    assert len(catch_word_nodes) == 0
Exemplo n.º 7
0
def page_to_alto(uid: str, task_result_dir: str) -> None:
    """Convert the PAGE result files of a task to ALTO files.

    Every entry in the directory returned by ``ocr_result_path`` is converted
    with ``OcrdPageAltoConverter`` and written into the directory returned by
    ``alto_result_path``. The output file name is the input file name with
    "CALAMARI" replaced by "ALTO".

    Args:
        uid: Task identifier; only used in the log messages.
        task_result_dir: Result directory of the task, handed to
            ``ocr_result_path`` and ``alto_result_path``.

    Returns:
        None. If ``ocr_result_path`` yields ``None`` the function logs that
        and returns without converting anything.
    """
    page_result_path = ocr_result_path(task_result_dir)

    if page_result_path is None:
        logger.info(f"Can't find page results to create alto for task {uid}.")
        return

    alto_path = alto_result_path(task_result_dir)
    for file_path in page_result_path.iterdir():
        converter = OcrdPageAltoConverter(
            check_words=False,
            check_border=False,
            page_filename=file_path,
            region_order="reading-order",
        )
        alto_xml = converter.convert()
        alto_file_name = file_path.name.replace("CALAMARI", "ALTO")
        alto_result_file = alto_path.joinpath(alto_file_name)
        # Write with an explicit encoding: without it the platform's locale
        # encoding is used, which can corrupt or reject non-ASCII OCR text.
        # NOTE(review): assumes the serialised ALTO declares UTF-8 — confirm.
        with open(alto_result_file, "w", encoding="utf-8") as alto_file:
            alto_file.write(str(alto_xml))
    logger.info(f"Created alto from page for task {uid}.")
Exemplo n.º 8
0
def test_dummy():
    """Check that a dummy TextLine and dummy Word are created for a region.

    The converter is run with ``dummy_textline`` and ``dummy_word`` enabled;
    exactly one line and one string with the expected dummy IDs must exist,
    and the dummy string must have the expected CONTENT.
    """
    options = dict(
        check_border=False,
        dummy_textline=True,
        dummy_word=True,
        page_filename='tests/data/region_no_line.page.xml',
    )
    alto = OcrdPageAltoConverter(**options).convert()
    root = ET.fromstring(str(alto).encode('utf-8'))

    line_xpath = '//alto:TextLine[@ID="r0-dummy-TextLine"]'
    word_xpath = '//alto:String[@ID="r0-dummy-TextLine-dummy-Word"]'

    dummy_lines = root.xpath(line_xpath, namespaces=NAMESPACES)
    assert len(dummy_lines) == 1

    dummy_words = root.xpath(word_xpath, namespaces=NAMESPACES)
    assert len(dummy_words) == 1
    assert dummy_words[0].get('CONTENT') == 'CONTENT BUT NO LINES'
Exemplo n.º 9
0
def test_convert_no_words():
    """Check that constructing the converter raises the expected ValueError."""
    init_kwargs = {
        'check_border': False,
        'page_filename': 'tests/data/content-no-words.page.xml',
    }
    expected_message = 'Line the-bad-one has.*not words'
    with raises(ValueError, match=expected_message):
        OcrdPageAltoConverter(**init_kwargs)
Exemplo n.º 10
0
def test_convert3():
    """Smoke test: converting the extra_regions PAGE file must not raise."""
    page_file = 'tests/assets/origin/Blumbach/extra_regions.xml'
    converter = OcrdPageAltoConverter(page_filename=page_file)
    converter.convert()
Exemplo n.º 11
0
def test_convert1():
    """Smoke test: converting the kant_aufklaerung PAGE file must not raise."""
    page_file = (
        'tests/assets/kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0017_PAGE.xml'
    )
    converter = OcrdPageAltoConverter(page_filename=page_file)
    converter.convert()
Exemplo n.º 12
0
def test_empty_init_kwargs():
    """Check that the converter raises ValueError when given no arguments."""
    no_kwargs = {}
    with raises(ValueError):
        OcrdPageAltoConverter(**no_kwargs)
Exemplo n.º 13
0
def test_hyp():
    """Check that at least one HYP element exists with ``trailing_dash_to_hyp``."""
    converter = OcrdPageAltoConverter(
        trailing_dash_to_hyp=True,
        page_filename='tests/data/sp-hyp.page.xml')
    alto = converter.convert()
    root = ET.fromstring(str(alto).encode('utf-8'))
    hyp_nodes = root.xpath('//alto:HYP', namespaces=NAMESPACES)
    # A non-empty result list is truthy.
    assert hyp_nodes
Exemplo n.º 14
0
def test_sp():
    """Check that the converted document contains exactly two SP elements."""
    converter = OcrdPageAltoConverter(
        page_filename='tests/data/sp-hyp.page.xml')
    alto = converter.convert()
    root = ET.fromstring(str(alto).encode('utf-8'))
    sp_nodes = root.xpath('//alto:SP', namespaces=NAMESPACES)
    assert len(sp_nodes) == 2
Exemplo n.º 15
0
def test_pageclass():
    """Check that the first Page element has PAGECLASS 'blank'."""
    converter = OcrdPageAltoConverter(
        page_filename='tests/data/blank.page.xml')
    alto = converter.convert()
    root = ET.fromstring(str(alto).encode('utf-8'))
    first_page = root.xpath('//alto:Page', namespaces=NAMESPACES)[0]
    assert first_page.get('PAGECLASS') == 'blank'