Exemplo n.º 1
0
def main():
    """Run ``sym_wordcorrect`` on the 'DEFAULT' configuration and print timing.

    The configuration is fetched with ``util.get_config('DEFAULT')`` and handed
    straight to ``sym_wordcorrect``. Afterwards the wall-clock start time, end
    time (both as HH:MM:SS) and the elapsed duration are printed.
    """
    began = datetime.now()
    sym_wordcorrect(util.get_config('DEFAULT'))
    finished = datetime.now()

    # A datetime format spec inside an f-string is forwarded to strftime().
    for label, moment in (("Start: ", began), ("End:   ", finished)):
        print(f"{label}{moment:%H:%M:%S}")
    print(f"Elapsed: {finished - began}")
def main():
    """Run OCR on the configured image directory and print timing.

    Reads the 'imgdir' and 'outdir' entries from the ``tessconf`` section of
    ``util.Confs`` and passes them to ``do_ocr`` together with the traineddata
    labels 'Fraktur', 'dan' and 'frk'. Afterwards the wall-clock start time,
    end time (both as HH:MM:SS) and the elapsed duration are printed.
    """
    began = datetime.now()

    settings = util.Confs(util.get_config())
    image_dir = settings.tessconf['imgdir']
    output_dir = settings.tessconf['outdir']
    models = ['Fraktur', 'dan', 'frk']
    do_ocr(image_dir, output_dir, traineddata_labels=models)

    finished = datetime.now()

    # A datetime format spec inside an f-string is forwarded to strftime().
    for label, moment in (("Start: ", began), ("End:   ", finished)):
        print(f"{label}{moment:%H:%M:%S}")
    print(f"Elapsed: {finished - began}")
Exemplo n.º 3
0
# Core libraries
import configparser
import os
from datetime import datetime
# Image processing
try:
    from PIL import Image
except ImportError:
    import Image
import myutils as util
# try to speed up!
import multiprocessing as mp
import pytesseract
from itertools import product

# Module-level setup: runs once, when this module is imported.
# Load the project configuration and wrap it in util.Confs for section access.
conf = util.get_config()
pth = util.Confs(conf)

# Tell pytesseract which Tesseract executable to run ('tess_bin' setting).
pytesseract.pytesseract.tesseract_cmd = pth.tessconf['tess_bin']
# Tesseract command-line option naming the traineddata directory; the path is
# wrapped in double quotes inside the option string.
# NOTE(review): presumably passed through pytesseract's `config=` argument --
# confirm against the callers.
tessdata_dir_config = fr'''--tessdata-dir "{pth.tessconf['tessdata_dir']}"'''


def process(arg_tuple):
    """OCR process.

    Args:
        arg_tuple: A 3-tuple ``(path, outdir, traineddata_label)``. ``path`` is
            a directory whose entries are visited one by one. ``outdir`` and
            ``traineddata_label`` are unpacked here but not used in the part
            of the function visible in this excerpt.

    NOTE(review): taking a single tuple argument suggests this is a worker for
    a multiprocessing pool map -- confirm against the caller.
    """
    path, outdir, traineddata_label = arg_tuple
    # for image in [sorted(os.listdir(path))[0]]:  # Use this to only OCR first page in novel/sample.
    # os.listdir() returns entries in arbitrary order, so pages are not
    # necessarily handled in sequence.
    for image in os.listdir(path):
        # Create image path with join
        imagepath = os.path.join(path, image)
        # Progress message, printed once per directory entry.
        print(f'Working on {imagepath}')
        # Convert image to text
        # (the conversion code itself is not part of this excerpt)
Exemplo n.º 4
0
def main():
    """Run the OCR and correction pipeline, then print timing.

    Every stage is optional and runs only when its boolean option in the
    correction config is enabled: 'run_make_dictionary', 'run_pdf2img',
    'run_ocr', 'correct_easy', 'correct_hard' and 'sym_wordcorrect'.
    After each of the two fraktur correction stages, the corrected directory
    becomes the input directory of whatever stage runs next.

    Finally the wall-clock start time, end time (both as HH:MM:SS) and the
    elapsed duration are printed.
    """
    began = datetime.now()

    # Resolve the correction settings, the parameter string and the various
    # paths (created if necessary).
    conf = util.Confs(util.get_config()).corrconf
    *_, param_str = util.get_params(conf)
    paths = util.CorrPaths(conf)
    out_root = paths.fulloutputdir

    # Traineddata models to OCR with.
    # Note! frk.traineddata has to come from tessdata_fast in order to work:
    # https://github.com/tesseract-ocr/tessdata_fast/blob/master/frk.traineddata
    # The same goes for dan.traineddata:
    # https://github.com/tesseract-ocr/tessdata_fast/blob/master/dan.traineddata
    # Fraktur.traineddata can be downloaded from tessdata_best:
    # https://github.com/tesseract-ocr/tessdata_best/blob/master/script/Fraktur.traineddata
    traineddata_labels = ['Fraktur', 'dan', 'frk']
    # Only referenced by the disabled gold-annotation stage further down.
    tess_outdirs = [os.path.join(out_root, f'tess_out_{model}') for model in traineddata_labels]
    # Directory the next correction stage reads from; starts at the base OCR output.
    current_dir = os.path.join(out_root, conf['base_ocr'])
    corrected_dir = os.path.join(out_root, param_str)

    # Pipeline stages. The config file decides which of them are performed.
    if conf.getboolean('run_make_dictionary'):
        make_dic(conf['metadir'])

    if conf.getboolean('run_pdf2img'):
        split_size = int(conf['split_size'])
        pdfs2imgs(paths.frakturpaths, paths.img_dir, split_size)

    if conf.getboolean('run_ocr'):
        do_ocr(paths.img_dir, out_root, traineddata_labels)

    if conf.getboolean('correct_easy'):
        correct_easy_fraktur_errors(current_dir, corrected_dir)
        current_dir = corrected_dir

    if conf.getboolean('correct_hard'):
        correct_hard_fraktur_errors(current_dir, out_root, corrected_dir)
        current_dir = corrected_dir

    if conf.getboolean('sym_wordcorrect'):
        sym_wordcorrect(conf, current_dir, corrected_dir)

    # Disabled stages, kept verbatim for reference.
    # TODO Will it make any sense to employ SymSpell at the bigram level? Probably not?
    # if conf.getboolean('make_basic_gold_vrt'):
    #     gold_vrt_gen = generate_novels_vrt(corrpaths.gold_novels_dir, corrpaths.corp_label)
    #     write_novels_vrt(gold_vrt_gen, corrpaths.basic_gold_vrt_path)
    # if conf.getboolean('annotate_gold_vrt'):
    #     text_annotation_generator = generate_gold_annotations(corrpaths.basic_gold_vrt_path, corrpaths.ocr_kb_dir,
    #                                                           conf['texton_out_dir'], corrpaths.corp_label, tess_outdirs,
    #                                                           [corrected_dir], conf)  # TODO single dir instead of list of dirs?
    #     write_annotated_gold_vrt(text_annotation_generator, corrpaths.local_annotated_gold_vrt_path)
    #     shutil.copy(corrpaths.local_annotated_gold_vrt_path, corrpaths.annotated_gold_vrt_path)
    # if conf.getboolean('analyze_errors'):
    #     # TODO Not very transparent error when n_datasets is wrong.
    #     analyze_gold_vrt(corrpaths.annotated_gold_vrt_path, conf, corrpaths.analyses_dir, param_str, n_datasets=5)
    # if conf.getboolean('write_korp_configs'):
    #     util.write_frakturgold_mode(conf['frakturgold_mode_template'],
    #                                 conf['gold_vrt_p_attrs'],
    #                                 conf['frakturgold_mode_outpath'])
    #     shutil.copy(conf['frakturgold_mode_outpath'], os.path.join(corrpaths.vrt_dir, 'memo_frakturgold_mode.js'))
    #     util.write_frakturgold_encodescript(conf['frakturgold_encode_template'],
    #                                         corrpaths.annotated_outdir,
    #                                         conf['gold_vrt_p_attrs'],
    #                                         conf['frakturgold_encode_outpath'])
    #     shutil.copy(conf['frakturgold_encode_outpath'], os.path.join(corrpaths.vrt_dir, 'encode_MEMO_fraktur_gold.sh'))
    # if conf.getboolean('write_word'):
    #     pass

    finished = datetime.now()

    # A datetime format spec inside an f-string is forwarded to strftime().
    for label, moment in (("Start: ", began), ("End:   ", finished)):
        print(f"{label}{moment:%H:%M:%S}")
    print(f"Elapsed: {finished - began}")