示例#1
0
文件: watcher.py 项目: deisi/OCRmyPDF
def main():
    """Log the watcher configuration and watch INPUT_DIRECTORY until interrupted.

    Configures OCRmyPDF logging, emits a summary at INFO level and a full
    settings dump at DEBUG level, then schedules a ``HandleObserverEvent``
    handler on a recursive observer of ``INPUT_DIRECTORY``. The calling
    thread sleeps in one-second steps until a ``KeyboardInterrupt`` arrives,
    at which point the observer is stopped and joined.
    """
    ocrmypdf.configure_logging(
        verbosity=ocrmypdf.Verbosity.default, manage_root_logger=True
    )

    startup_summary = (
        f"Starting OCRmyPDF watcher with config:\n"
        f"Input Directory: {INPUT_DIRECTORY}\n"
        f"Output Directory: {OUTPUT_DIRECTORY}\n"
        f"Output Directory Year & Month: {OUTPUT_DIRECTORY_YEAR_MONTH}"
    )
    log.info(startup_summary)

    # One "NAME: value" line per setting, each terminated by a newline.
    settings = (
        ("INPUT_DIRECTORY", INPUT_DIRECTORY),
        ("OUTPUT_DIRECTORY", OUTPUT_DIRECTORY),
        ("OUTPUT_DIRECTORY_YEAR_MONTH", OUTPUT_DIRECTORY_YEAR_MONTH),
        ("ON_SUCCESS_DELETE", ON_SUCCESS_DELETE),
        ("DESKEW", DESKEW),
        ("POLL_NEW_FILE_SECONDS", POLL_NEW_FILE_SECONDS),
        ("LOGLEVEL", LOGLEVEL),
    )
    log.debug("".join(f"{label}: {value}\n" for label, value in settings))

    event_handler = HandleObserverEvent(patterns=PATTERNS)
    watcher = Observer()
    watcher.schedule(event_handler, INPUT_DIRECTORY, recursive=True)
    watcher.start()
    try:
        # Keep the calling thread alive; Ctrl-C is the only exit handled here.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        watcher.stop()
    watcher.join()
示例#2
0
def main():
    """Validate settings, log the configuration and watch INPUT_DIRECTORY.

    Configures OCRmyPDF logging and sets the module logger to ``LOGLEVEL``,
    then emits a summary at INFO level and a full settings dump at DEBUG
    level. Exits with status 1 if ``OCR_JSON_SETTINGS`` contains an
    ``input_file`` or ``output_file`` entry. Otherwise a
    ``HandleObserverEvent`` handler is scheduled recursively on
    ``INPUT_DIRECTORY`` using a ``PollingObserver`` when ``USE_POLLING`` is
    truthy and a plain ``Observer`` when it is not. The calling thread sleeps
    in one-second steps until a ``KeyboardInterrupt`` arrives, at which point
    the observer is stopped and joined.
    """
    ocrmypdf.configure_logging(
        verbosity=ocrmypdf.Verbosity.default, manage_root_logger=True
    )
    log.setLevel(LOGLEVEL)

    startup_summary = (
        f"Starting OCRmyPDF watcher with config:\n"
        f"Input Directory: {INPUT_DIRECTORY}\n"
        f"Output Directory: {OUTPUT_DIRECTORY}\n"
        f"Output Directory Year & Month: {OUTPUT_DIRECTORY_YEAR_MONTH}"
    )
    log.info(startup_summary)

    # One "LABEL: value" line per setting, each terminated by a newline.
    settings = (
        ("INPUT_DIRECTORY", INPUT_DIRECTORY),
        ("OUTPUT_DIRECTORY", OUTPUT_DIRECTORY),
        ("OUTPUT_DIRECTORY_YEAR_MONTH", OUTPUT_DIRECTORY_YEAR_MONTH),
        ("ON_SUCCESS_DELETE", ON_SUCCESS_DELETE),
        ("DESKEW", DESKEW),
        ("ARGS", OCR_JSON_SETTINGS),
        ("POLL_NEW_FILE_SECONDS", POLL_NEW_FILE_SECONDS),
        ("USE_POLLING", USE_POLLING),
        ("LOGLEVEL", LOGLEVEL),
    )
    log.debug("".join(f"{label}: {value}\n" for label, value in settings))

    # The watcher supplies the file paths itself, so the settings must not.
    if any(key in OCR_JSON_SETTINGS for key in ('input_file', 'output_file')):
        log.error('OCR_JSON_SETTINGS should not specify input file or output file')
        sys.exit(1)

    event_handler = HandleObserverEvent(patterns=PATTERNS)
    observer_cls = PollingObserver if USE_POLLING else Observer
    watcher = observer_cls()
    watcher.schedule(event_handler, INPUT_DIRECTORY, recursive=True)
    watcher.start()
    try:
        # Keep the calling thread alive; Ctrl-C is the only exit handled here.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        watcher.stop()
    watcher.join()
示例#3
0
def pdf2pdfa(args):
    """Produce ``args['norm_file_path']`` from the input described by ``args``.

    Args:
        args: Mutable dict read for ``mime_type``, ``version``,
            ``source_file_path``, ``tmp_file_path`` and ``norm_file_path``.
            When ``mime_type`` is ``'application/pdf'`` the function sets
            ``args['tmp_file_path']`` to ``args['source_file_path']``.
            For any other MIME type ``tmp_file_path`` must already be set.

    Returns:
        bool: ``True`` when the normalized file was produced. On the copy
        path this means ``norm_file_path`` exists afterwards; on the OCR
        path it means ``ocrmypdf.ocr`` returned ``ExitCode.ok``.
    """
    if args['mime_type'] == 'application/pdf':
        args['tmp_file_path'] = args['source_file_path']

        # WAIT: Add an extra check here for what to do if ocr = True
        # NOTE(review): these look like PDF/A conformance levels that need no
        # conversion, and file_copy presumably writes norm_file_path - confirm.
        if args['version'] in ('1a', '1b', '2a', '2b'):
            file_copy(args)
            return os.path.exists(args['norm_file_path'])

    ocrmypdf.configure_logging(-1)
    result = ocrmypdf.ocr(args['tmp_file_path'],
                          args['norm_file_path'],
                          tesseract_timeout=0,
                          progress_bar=False,
                          skip_text=True)
    # Compare against the enum member rather than its string form: str() of
    # an IntEnum member is the bare integer on Python 3.11+, so a test
    # against the text 'ExitCode.ok' would never match there.
    return result == ocrmypdf.ExitCode.ok
示例#4
0
else:
    start_dir = '.'

# The second command-line argument, when given, names the log file;
# otherwise the log is written to 'ocr-tree.log' under script_dir.
log_file = sys.argv[2] if len(sys.argv) > 2 else script_dir + '/ocr-tree.log'

# filemode='w' starts the log file afresh on every run.
logging.basicConfig(
    filename=log_file,
    filemode='w',
    format='%(asctime)s %(message)s',
    level=logging.INFO,
)

ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)

# Walk the tree under start_dir and OCR every '.pdf' file in place.
# The working directory is deliberately left unchanged: os.walk() resolves a
# relative start_dir against the current directory at each step, so calling
# os.chdir() during the walk makes later directories unreachable.
for dir_name, _subdirs, file_list in os.walk(start_dir):
    logging.info(dir_name + '\n')
    for filename in file_list:
        if os.path.splitext(filename)[1] != '.pdf':
            continue
        full_path = os.path.join(dir_name, filename)
        print(full_path)
        # Input and output are the same path, so the file is replaced by
        # its OCRed version.
        result = ocrmypdf.ocr(full_path, full_path, deskew=True)
        if result == ocrmypdf.ExitCode.already_done_ocr:
            print("Skipped document because it already contained text")
        elif result == ocrmypdf.ExitCode.ok:
            print("OCR complete")
        logging.info(result)