def get_ocr_language(language):
    """Return the OCR language code to use for ``language``.

    Resolution order:

    1. ``language`` itself (or the result of ``get_language()`` when
       ``language`` is None), if it is one of the available OCR languages.
    2. The entry for ``language`` in the "ocr languages" configuration
       mapping, if that entry is an available OCR language.
    3. The three-letter code that pycountry reports for the two-letter
       code ``language``, if it is an available OCR language.

    Raises:
        Exception: if none of the steps above yields an available
            language.  The message names the fallback language ('eng' if
            available, otherwise the first available language).
        IndexError: if no match is found and the list of available
            languages is empty.

    NOTE(review): the messages say "using language ..." but the fallback
    is never returned because an exception is raised instead -- confirm
    that callers rely on the exception rather than on the fallback.
    """
    langs = get_available_languages()
    if language is None:
        language = get_language()
    ocr_langs = get_config("ocr languages")
    if ocr_langs is None:
        ocr_langs = {}
    if language in langs:
        return language
    if language in ocr_langs and ocr_langs[language] in langs:
        return ocr_langs[language]
    # Language named in the error messages below.
    if 'eng' in langs:
        fallback = 'eng'
    else:
        fallback = langs[0]
    # Keep the try body limited to the pycountry lookup so that the
    # "unsupported language" error raised further down is not caught and
    # wrapped a second time by this handler.
    try:
        pc_lang = pycountry.languages.get(alpha_2=language)
        lang_three_letter = pc_lang.alpha_3
    except Exception as the_error:
        raise Exception("could not get OCR language for language " + str(language) + "; using language " + str(fallback) + "; error was " + str(the_error))
    if lang_three_letter in langs:
        return lang_three_letter
    raise Exception("could not get OCR language for language " + str(language) + "; using language " + str(fallback))
def ocr_page_tasks(image_file, language=None, psm=6, x=None, y=None, W=None, H=None, user_code=None, user=None, pdf=False, preserve_color=False, **kwargs):
    """Build the list of per-page OCR task descriptions for ``image_file``.

    ``image_file`` may be a DAFile, a DAFileList or a list.  A set yields
    an empty list, and any other type yields the translated message
    "(Not a DAFile, DAFileList, or list object)" instead of a list.

    Each task is a dict with the keys ``doc``, ``page``, ``lang``,
    ``ocr_resolution``, ``psm``, ``x``, ``y``, ``W``, ``H``,
    ``pdf_to_ppm``, ``user_code``, ``user``, ``pdf`` and
    ``preserve_color``.  PDFs produce one task per page (1-based);
    docx/doc/odt/rtf files are first passed through
    ``docassemble.base.util.pdf_concatenate`` and then handled like PDFs;
    other accepted files produce a single task with ``page`` set to None.
    Items without an ``extension`` attribute are skipped.

    If no OCR language can be matched to ``language``, a message is
    written to stderr and 'eng' (or the first available language) is used.

    Extra keyword arguments are accepted and ignored.

    Raises:
        Exception: if a document has an extension outside the accepted set.
    """
    if isinstance(image_file, set):
        return []
    if not isinstance(image_file, (DAFile, DAFileList, list)):
        return word("(Not a DAFile, DAFileList, or list object)")

    pdf_to_ppm = get_config("pdftoppm")
    if pdf_to_ppm is None:
        pdf_to_ppm = 'pdftoppm'
    ocr_resolution = get_config("ocr dpi")
    if ocr_resolution is None:
        ocr_resolution = '300'

    langs = get_available_languages()
    if language is None:
        language = get_language()
    lang = language
    if lang not in langs:
        ocr_langs = get_config("ocr languages")
        if ocr_langs is None:
            ocr_langs = {}
        if language in ocr_langs and ocr_langs[language] in langs:
            lang = ocr_langs[language]
        else:
            try:
                candidate = pycountry.languages.get(alpha_2=language).alpha_3
                if candidate in langs:
                    lang = candidate
                else:
                    lang = 'eng' if 'eng' in langs else langs[0]
                    sys.stderr.write("ocr_file: could not get OCR language for language " + str(language) + "; using language " + str(lang) + "\n")
            except Exception as the_error:
                lang = 'eng' if 'eng' in langs else langs[0]
                sys.stderr.write("ocr_file: could not get OCR language for language " + str(language) + "; using language " + str(lang) + "; error was " + str(the_error) + "\n")

    if isinstance(image_file, DAFile):
        image_file = [image_file]

    # Settings that are identical for every task produced by this call.
    shared = dict(lang=lang, ocr_resolution=ocr_resolution, psm=psm, x=x, y=y, W=W, H=H, pdf_to_ppm=pdf_to_ppm, user_code=user_code, user=user, pdf=pdf, preserve_color=preserve_color)

    todo = []
    for doc in image_file:
        if not hasattr(doc, 'extension'):
            continue
        if doc.extension not in ['pdf', 'png', 'jpg', 'gif', 'docx', 'doc', 'odt', 'rtf']:
            raise Exception("document with extension " + doc.extension + " is not a readable image file")
        if doc.extension == 'pdf':
            paged_doc = doc
        elif doc.extension in ("docx", "doc", "odt", "rtf"):
            paged_doc = docassemble.base.util.pdf_concatenate(doc)
        else:
            # Single image: one task, no page number.
            todo.append(dict(doc=doc, page=None, **shared))
            continue
        page_count = safe_pypdf_reader(paged_doc.path()).getNumPages()
        todo.extend(dict(doc=paged_doc, page=page_number, **shared) for page_number in range(1, page_count + 1))
    return todo
def ocr_page_tasks(image_file, language=None, psm=6, x=None, y=None, W=None, H=None, user_code=None, **kwargs):
    """Build the list of per-page OCR task descriptions for ``image_file``.

    ``image_file`` must be a DAFile or a DAFileList; for any other type
    the translated message "(Not a DAFile or DAFileList object)" is
    returned instead of a list.

    Each task is a dict with the keys ``doc``, ``page``, ``lang``,
    ``ocr_resolution``, ``psm``, ``x``, ``y``, ``W``, ``H``,
    ``pdf_to_ppm`` and ``user_code``.  PDFs produce one task per page
    (1-based); png/jpg/gif files produce a single task with ``page`` set
    to None.  Items without an ``extension`` attribute are skipped.

    If no OCR language can be matched to ``language``, a message is
    written to stderr and 'eng' (or the first available language) is used.

    Extra keyword arguments are accepted and ignored.

    Raises:
        Exception: if a document's extension is not pdf, png, jpg or gif.
    """
    if not isinstance(image_file, (DAFile, DAFileList)):
        return word("(Not a DAFile or DAFileList object)")
    pdf_to_ppm = get_config("pdftoppm")
    if pdf_to_ppm is None:
        pdf_to_ppm = 'pdftoppm'
    ocr_resolution = get_config("ocr dpi")
    if ocr_resolution is None:
        ocr_resolution = '300'
    langs = get_available_languages()
    if language is None:
        language = get_language()
    if language in langs:
        lang = language
    else:
        ocr_langs = get_config("ocr languages")
        if ocr_langs is None:
            ocr_langs = {}
        if language in ocr_langs and ocr_langs[language] in langs:
            lang = ocr_langs[language]
        else:
            try:
                pc_lang = pycountry.languages.get(alpha_2=language)
                lang_three_letter = pc_lang.alpha_3
                if lang_three_letter in langs:
                    lang = lang_three_letter
                else:
                    lang = 'eng' if 'eng' in langs else langs[0]
                    sys.stderr.write("ocr_file: could not get OCR language for language " + str(language) + "; using language " + str(lang) + "\n")
            except Exception as the_error:
                lang = 'eng' if 'eng' in langs else langs[0]
                sys.stderr.write("ocr_file: could not get OCR language for language " + str(language) + "; using language " + str(lang) + "; error was " + str(the_error) + "\n")
    if isinstance(image_file, DAFile):
        image_file = [image_file]
    todo = []
    for doc in image_file:
        if not hasattr(doc, 'extension'):
            continue
        if doc.extension not in ['pdf', 'png', 'jpg', 'gif']:
            raise Exception("document with extension " + doc.extension + " is not a readable image file")
        if doc.extension == 'pdf':
            # Close the file as soon as the page count is known; the
            # previous code left one open handle behind per PDF.
            with open(doc.path(), 'rb') as pdf_fp:
                num_pages = PdfFileReader(pdf_fp).getNumPages()
            # range() rather than xrange(), which does not exist on Python 3.
            for i in range(num_pages):
                todo.append(dict(doc=doc, page=i+1, lang=lang, ocr_resolution=ocr_resolution, psm=psm, x=x, y=y, W=W, H=H, pdf_to_ppm=pdf_to_ppm, user_code=user_code))
        else:
            todo.append(dict(doc=doc, page=None, lang=lang, ocr_resolution=ocr_resolution, psm=psm, x=x, y=y, W=W, H=H, pdf_to_ppm=pdf_to_ppm, user_code=user_code))
    return todo
def ocr_page_tasks(image_file, language=None, psm=6, x=None, y=None, W=None, H=None, user_code=None, **kwargs):
    """Build the list of per-page OCR task descriptions for ``image_file``.

    ``image_file`` must be a DAFile or a DAFileList; for any other type
    the translated message "(Not a DAFile or DAFileList object)" is
    returned instead of a list.

    Each task is a dict with the keys ``doc``, ``page``, ``lang``,
    ``ocr_resolution``, ``psm``, ``x``, ``y``, ``W``, ``H``,
    ``pdf_to_ppm`` and ``user_code``.  PDFs produce one task per page
    (1-based); png/jpg/gif files produce a single task with ``page`` set
    to None.  Items without an ``extension`` attribute are skipped.

    If no OCR language can be matched to ``language``, a message is
    written to stderr and 'eng' (or the first available language) is used.

    Extra keyword arguments are accepted and ignored.

    Raises:
        Exception: if a document's extension is not pdf, png, jpg or gif.
    """
    if not isinstance(image_file, (DAFile, DAFileList)):
        return word("(Not a DAFile or DAFileList object)")
    pdf_to_ppm = get_config("pdftoppm")
    if pdf_to_ppm is None:
        pdf_to_ppm = 'pdftoppm'
    ocr_resolution = get_config("ocr dpi")
    if ocr_resolution is None:
        ocr_resolution = '300'
    langs = get_available_languages()
    if language is None:
        language = get_language()
    if language in langs:
        lang = language
    else:
        ocr_langs = get_config("ocr languages")
        if ocr_langs is None:
            ocr_langs = {}
        if language in ocr_langs and ocr_langs[language] in langs:
            lang = ocr_langs[language]
        else:
            try:
                pc_lang = pycountry.languages.get(alpha_2=language)
                lang_three_letter = pc_lang.alpha_3
                if lang_three_letter in langs:
                    lang = lang_three_letter
                else:
                    lang = 'eng' if 'eng' in langs else langs[0]
                    sys.stderr.write(
                        "ocr_file: could not get OCR language for language " +
                        str(language) + "; using language " + str(lang) + "\n")
            except Exception as the_error:
                lang = 'eng' if 'eng' in langs else langs[0]
                sys.stderr.write(
                    "ocr_file: could not get OCR language for language " +
                    str(language) + "; using language " + str(lang) +
                    "; error was " + str(the_error) + "\n")
    if isinstance(image_file, DAFile):
        image_file = [image_file]
    todo = []
    for doc in image_file:
        if not hasattr(doc, 'extension'):
            continue
        if doc.extension not in ['pdf', 'png', 'jpg', 'gif']:
            raise Exception("document with extension " + doc.extension + " is not a readable image file")
        if doc.extension == 'pdf':
            # Read the page count inside a context manager so the file
            # handle is closed; the previous code never closed it.
            with open(doc.path(), 'rb') as pdf_fp:
                num_pages = PdfFileReader(pdf_fp).getNumPages()
            for i in range(num_pages):
                todo.append(
                    dict(doc=doc, page=i + 1, lang=lang,
                         ocr_resolution=ocr_resolution, psm=psm, x=x, y=y,
                         W=W, H=H, pdf_to_ppm=pdf_to_ppm,
                         user_code=user_code))
        else:
            todo.append(
                dict(doc=doc, page=None, lang=lang,
                     ocr_resolution=ocr_resolution, psm=psm, x=x, y=y,
                     W=W, H=H, pdf_to_ppm=pdf_to_ppm, user_code=user_code))
    return todo