예제 #1
0
def do_document_ocr(queue_document):
    """
    Fill in the text content of every page of the queued document.

    The registered parser is tried first for each page. When it raises
    ParserError or ParserUnknownFile the page image is converted, passed
    through unpaper and then handed to tesseract; the cleaned up result is
    stored on the page together with the 'Text from OCR' label.
    """
    for page in queue_document.document.pages.all():
        try:
            parse_document_page(page)
        except (ParserError, ParserUnknownFile):
            # The parser could not handle this page, do a visual OCR instead.
            # Only the transformations are used; the warnings are discarded.
            transformations, transformation_warnings = (
                queue_document.get_transformation_list())

            page_image = page.document.get_image_cache_name(
                page=page.page_number, version=page.document_version.pk)

            unpaper_result = os.path.join(
                TEMPORARY_DIRECTORY,
                u'%s_unpaper_out_page_%s%s%s' % (
                    page.document.uuid, page.page_number,
                    os.extsep, UNPAPER_FILE_FORMAT))

            unpaper_source = convert(
                page_image,
                file_format=UNPAPER_FILE_FORMAT,
                transformations=transformations)
            execute_unpaper(
                input_filepath=unpaper_source,
                output_filepath=unpaper_result)

            # Convert the unpaper output to the default OCR file format
            ocr_source = convert(
                input_filepath=unpaper_result,
                file_format=DEFAULT_OCR_FILE_FORMAT)
            # Tesseract needs an explicit file extension, so rename the file
            ocr_source_w_ext = os.extsep.join(
                [ocr_source, DEFAULT_OCR_FILE_EXTENSION])
            os.rename(ocr_source, ocr_source_w_ext)
            try:
                page.content = ocr_cleanup(
                    run_tesseract(ocr_source_w_ext, TESSERACT_LANGUAGE))
                page.page_label = _(u'Text from OCR')
                page.save()
            finally:
                # Hand every intermediate path to cleanup(), whether or not
                # the OCR step succeeded
                for leftover in (ocr_source_w_ext, unpaper_source,
                                 page_image, unpaper_result):
                    cleanup(leftover)
예제 #2
0
def do_document_ocr(document_version):
    """
    Extract the text content of every page of a document version.

    Each page is first handed to the registered parser. If that raises
    ParserError or ParserUnknownFile, the page image is converted, run
    through unpaper and passed to the OCR backend together with the
    language of the document; the cleaned up text is saved on the page.
    """
    for page in document_version.pages.all():
        try:
            parse_document_page(page)
        except (ParserError, ParserUnknownFile):
            # The parser gave up on this page: fall back to visual OCR
            page_image = page.document.get_image_cache_name(
                page=page.page_number, version=page.document_version.pk)
            logger.debug('document_filepath: %s', page_image)

            unpaper_source = convert(
                page_image, file_format=UNPAPER_FILE_FORMAT)
            logger.debug('unpaper_input: %s', unpaper_source)

            unpaper_result = execute_unpaper(input_filepath=unpaper_source)
            logger.debug('unpaper_output: %s', unpaper_result)

            # Convert the unpaper output to the default OCR file format
            ocr_source = convert(
                input_filepath=unpaper_result,
                file_format=DEFAULT_OCR_FILE_FORMAT)
            logger.debug('pre_ocr_filepath: %s', ocr_source)

            # The OCR input needs an explicit file extension
            ocr_source_w_ext = os.extsep.join(
                [ocr_source, DEFAULT_OCR_FILE_EXTENSION])
            logger.debug('pre_ocr_filepath_w_ext: %s', ocr_source_w_ext)

            os.rename(ocr_source, ocr_source_w_ext)
            try:
                raw_text = ocr_backend.execute(
                    ocr_source_w_ext, document_version.document.language)
                page.content = ocr_cleanup(
                    document_version.document.language, raw_text)
                page.page_label = _('Text from OCR')
                page.save()
            finally:
                # Hand every intermediate path to fs_cleanup(), whether or
                # not the OCR step succeeded
                for leftover in (ocr_source_w_ext, unpaper_source,
                                 page_image, unpaper_result):
                    fs_cleanup(leftover)
예제 #3
0
파일: api.py 프로젝트: MechanisM/mayan
def do_document_ocr(queue_document):
    """
    Store text content on each page of the document held by the queue entry.

    Parsing is attempted first. A ParserError or ParserUnknownFile from the
    parser triggers the visual path instead: the page image is converted
    with the queue entry's transformations, run through unpaper, converted
    to the default OCR format and read by tesseract.
    """
    for current_page in queue_document.document.documentpage_set.all():
        try:
            parse_document_page(current_page)
        except (ParserError, ParserUnknownFile):
            # Visual OCR fallback; the warnings half of the pair is not used
            page_transformations, unused_warnings = queue_document.get_transformation_list()

            cached_image = current_page.document.get_image_cache_name(page=current_page.page_number)
            unpaper_target = os.path.join(
                TEMPORARY_DIRECTORY,
                u"%s_unpaper_out_page_%s%s%s"
                % (current_page.document.uuid, current_page.page_number, os.extsep, UNPAPER_FILE_FORMAT),
            )

            unpaper_feed = convert(
                cached_image, file_format=UNPAPER_FILE_FORMAT, transformations=page_transformations
            )
            execute_unpaper(input_filepath=unpaper_feed, output_filepath=unpaper_target)

            # Convert the unpaper output to the default OCR file format
            tesseract_feed = convert(input_filepath=unpaper_target, file_format=DEFAULT_OCR_FILE_FORMAT)
            # Tesseract needs an explicit file extension, so rename the file
            tesseract_feed_w_ext = os.extsep.join([tesseract_feed, DEFAULT_OCR_FILE_EXTENSION])
            os.rename(tesseract_feed, tesseract_feed_w_ext)
            try:
                recognized_text = run_tesseract(tesseract_feed_w_ext, TESSERACT_LANGUAGE)
                current_page.content = ocr_cleanup(recognized_text)
                current_page.page_label = _(u"Text from OCR")
                current_page.save()
            finally:
                # Pass each intermediate path to cleanup() regardless of the
                # outcome of the OCR step
                for stale_path in (tesseract_feed_w_ext, unpaper_feed, cached_image, unpaper_target):
                    cleanup(stale_path)
예제 #4
0
def do_document_ocr(document_version):
    """
    Store text content on each page of the given document version.

    Parsing is attempted first. A ParserError or ParserUnknownFile from the
    parser triggers the visual path instead: the page image is converted,
    run through unpaper, converted to the default OCR format and passed to
    the OCR backend along with the document language.
    """
    for current_page in document_version.pages.all():
        try:
            parse_document_page(current_page)
        except (ParserError, ParserUnknownFile):
            # Visual OCR fallback; every intermediate path is logged
            cached_image = current_page.document.get_image_cache_name(
                page=current_page.page_number,
                version=current_page.document_version.pk,
            )
            logger.debug('document_filepath: %s', cached_image)

            unpaper_feed = convert(cached_image, file_format=UNPAPER_FILE_FORMAT)
            logger.debug('unpaper_input: %s', unpaper_feed)

            unpaper_target = execute_unpaper(input_filepath=unpaper_feed)
            logger.debug('unpaper_output: %s', unpaper_target)

            # Convert the unpaper output to the default OCR file format
            backend_feed = convert(input_filepath=unpaper_target, file_format=DEFAULT_OCR_FILE_FORMAT)
            logger.debug('pre_ocr_filepath: %s', backend_feed)

            # The OCR input needs an explicit file extension
            backend_feed_w_ext = os.extsep.join([backend_feed, DEFAULT_OCR_FILE_EXTENSION])
            logger.debug('pre_ocr_filepath_w_ext: %s', backend_feed_w_ext)

            os.rename(backend_feed, backend_feed_w_ext)
            try:
                recognized_text = ocr_backend.execute(backend_feed_w_ext, document_version.document.language)
                current_page.content = ocr_cleanup(document_version.document.language, recognized_text)
                current_page.page_label = _('Text from OCR')
                current_page.save()
            finally:
                # Pass each intermediate path to fs_cleanup() regardless of
                # the outcome of the OCR step
                for stale_path in (backend_feed_w_ext, unpaper_feed, cached_image, unpaper_target):
                    fs_cleanup(stale_path)
예제 #5
0
파일: models.py 프로젝트: fccoelho/mayan
 def get_image_cache_name(self, page):
     """
     Return the path of the cached image for ``page``.

     The path and transformations come from ``get_cached_image_name``. If
     the path already exists it is returned as is; otherwise the document
     is saved to the temporary directory and the result of converting it
     to that path is returned.
     """
     cached_path, cached_transformations = self.get_cached_image_name(page)
     if not os.path.exists(cached_path):
         # Cache miss: generate the image from a temporary copy
         temporary_copy = document_save_to_temp_dir(self, self.checksum)
         return convert(temporary_copy, output_filepath=cached_path, page=page, transformations=cached_transformations)
     return cached_path
예제 #6
0
파일: staging.py 프로젝트: fccoelho/mayan
 def get_image(self, size, transformations):
     """
     Return the result of converting this file, or an icon path on failure.

     UnknownFileFormat yields the icon path for the file's mimetype and
     UnkownConvertError yields the error icon path.
     """
     try:
         image_path = convert(self.filepath, size=size, cleanup_files=False, transformations=transformations)
     except UnknownFileFormat:
         image_path = get_icon_file_path(get_mimetype(self.filepath))
     except UnkownConvertError:
         image_path = get_error_icon_file_path()
     return image_path
예제 #7
0
    def get_valid_image(self, size=DISPLAY_SIZE, page=DEFAULT_PAGE_NUMBER, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION, version=None):
        """
        Return the result of converting the cached image of a page.

        ``version`` is the primary key of the document version to use; when
        it is falsy the pk of ``self.latest_version`` is used instead. The
        cached image name for that page and version is then converted with
        the requested ``size``, ``zoom`` and ``rotation``, passing
        ``cleanup_files=False`` to ``convert``.
        """
        if not version:
            version = self.latest_version.pk
        image_cache_name = self.get_image_cache_name(page=page, version=version)

        # Pass the value as a logger argument so it is only interpolated
        # when the debug level is actually enabled
        logger.debug('image_cache_name: %s', image_cache_name)

        return convert(image_cache_name, cleanup_files=False, size=size, zoom=zoom, rotation=rotation)
예제 #8
0
 def get_image_cache_name(self, page, version):
     """
     Return the path of the cached image for ``page`` of ``version``.

     The path and transformations come from ``get_cached_image_name``. An
     existing path is returned unchanged. Otherwise the DocumentVersion
     with primary key ``version`` is saved to the temporary directory and
     the result of converting it to the cache path is returned.
     """
     cached_path, cached_transformations = self.get_cached_image_name(page, version)
     if not os.path.exists(cached_path):
         # Cache miss: generate the image from a temporary copy
         requested_version = DocumentVersion.objects.get(pk=version)
         temporary_copy = document_save_to_temp_dir(requested_version, requested_version.checksum)
         return convert(temporary_copy, output_filepath=cached_path, page=page, transformations=cached_transformations, mimetype=self.file_mimetype)
     return cached_path
예제 #9
0
파일: models.py 프로젝트: AmiGanguli/mayan
 def get_image_cache_name(self, page, version):
     """
     Look up, and generate if missing, the cached image of a page.

     ``get_cached_image_name`` supplies the cache path and the
     transformations to apply. When the path exists it is returned
     directly. When it does not, the DocumentVersion whose pk is
     ``version`` is written to the temporary directory and converted to
     the cache path; the value returned by ``convert`` is passed back.
     """
     image_path, image_transformations = self.get_cached_image_name(page, version)
     cache_hit = os.path.exists(image_path)
     if cache_hit:
         return image_path

     source_version = DocumentVersion.objects.get(pk=version)
     source_file = document_save_to_temp_dir(source_version, source_version.checksum)
     return convert(
         source_file,
         output_filepath=image_path,
         page=page,
         transformations=image_transformations,
         mimetype=self.file_mimetype,
     )
예제 #10
0
파일: models.py 프로젝트: fccoelho/mayan
 def get_image(self, size=DISPLAY_SIZE, page=DEFAULT_PAGE_NUMBER, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION):
     """
     Return the converted image of a page, or an icon path on failure.

     The cached image name of ``page`` is converted with the requested
     ``size``, ``zoom`` and ``rotation``. UnknownFileFormat yields the icon
     path for the document mimetype; any other error yields the error icon
     path, so this method is best effort and does not raise for ordinary
     failures.
     """
     try:
         image_cache_name = self.get_image_cache_name(page=page)
         return convert(image_cache_name, cleanup_files=False, size=size, zoom=zoom, rotation=rotation)
     except UnknownFileFormat:
         return get_icon_file_path(self.file_mimetype)
     except UnkownConvertError:
         return get_error_icon_file_path()
     except Exception:
         # Still best effort, but unlike a bare ``except:`` this lets
         # KeyboardInterrupt and SystemExit propagate
         return get_error_icon_file_path()
예제 #11
0
    def get_image(self, size, page, zoom, rotation, as_base64=True):
        """
        Return the converted image as a data URI or as a file path.

        The file at ``self.get_full_path()`` is converted to ``size``. With
        ``as_base64`` true (the default) the converted file is read and
        returned as a ``data:<mimetype>;base64,<payload>`` string; otherwise
        the path returned by ``convert`` is returned. ``page``, ``zoom`` and
        ``rotation`` are accepted but not used by this implementation.
        """
        # TODO: add support for transformations
        converted_file_path = convert(self.get_full_path(), size=size)

        if not as_base64:
            return converted_file_path

        # Image data is binary: open in 'rb' and let the context managers
        # close both handles even when an error is raised
        with open(converted_file_path, 'rb') as descriptor:
            mimetype = get_mimetype(descriptor, converted_file_path, mimetype_only=True)[0]

        with open(converted_file_path, 'rb') as image:
            base64_data = base64.b64encode(image.read())

        # b64encode output is pure ASCII; decoding keeps the URI free of a
        # bytes repr on interpreters where b64encode returns bytes
        return u'data:%s;base64,%s' % (mimetype, base64_data.decode('ascii'))
예제 #12
0
파일: views.py 프로젝트: strogo/mayan
def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_DEFAULT):
    """
    Serve a JPEG rendering of one page of a document.

    The page number is read from the ``page`` GET parameter (default 1).
    The transformations stored for that page are turned into an option
    string, and the image is served from the image cache when present or
    converted from a temporary copy of the document otherwise. On
    UnkownConvertError an error picture is served instead (the small one
    when ``size`` equals THUMBNAIL_SIZE, the medium one otherwise).
    """
    check_permissions(request.user, "documents", [PERMISSION_DOCUMENT_VIEW])

    document = get_object_or_404(Document, pk=document_id)

    page = int(request.GET.get("page", 1))
    transformation_list = []
    try:
        # Catch invalid or non existing pages
        document_page = DocumentPage.objects.get(document=document, page_number=page)
        for page_transformation in document_page.documentpagetransformation_set.all():
            try:
                if page_transformation.transformation in TRANFORMATION_CHOICES:
                    # SECURITY NOTE(review): eval() executes whatever is
                    # stored in the arguments field; confirm who can write
                    # it and consider a safe literal parser instead.
                    output = TRANFORMATION_CHOICES[page_transformation.transformation] % eval(
                        page_transformation.arguments
                    )
                    transformation_list.append(output)
            except Exception as e:
                # A broken transformation is skipped; only staff are told
                if request.user.is_staff:
                    messages.warning(
                        request,
                        _(u"Error for transformation %(transformation)s:, %(error)s")
                        % {"transformation": page_transformation.get_transformation_display(), "error": e},
                    )
    except ObjectDoesNotExist:
        # No such page: continue without any transformations
        pass

    tranformation_string = " ".join(transformation_list)
    try:
        # The cache and the converter receive a zero based page index
        filepath = in_image_cache(
            document.checksum, size=size, quality=quality, extra_options=tranformation_string, page=page - 1
        )
        if filepath:
            # Images are binary data, so open them in 'rb' mode
            return serve_file(request, File(file=open(filepath, "rb")), content_type="image/jpeg")
        # Save to a temporary location
        filepath = document_save_to_temp_dir(document, filename=document.checksum)
        output_file = convert(
            filepath, size=size, format="jpg", quality=quality, extra_options=tranformation_string, page=page - 1
        )
        return serve_file(request, File(file=open(output_file, "rb")), content_type="image/jpeg")
    except UnkownConvertError as e:
        if request.user.is_staff or request.user.is_superuser:
            messages.error(request, e)
        if size == THUMBNAIL_SIZE:
            error_picture = PICTURE_ERROR_SMALL
        else:
            error_picture = PICTURE_ERROR_MEDIUM
        return serve_file(
            request, File(file=open("%simages/%s" % (settings.MEDIA_ROOT, error_picture), "rb"))
        )
예제 #13
0
def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_DEFAULT):
    """
    Serve a JPEG rendering of one page of a document.

    The page number is read from the ``page`` GET parameter (default 1).
    The transformations stored for that page are turned into an option
    string, and the image is served from the image cache when present or
    converted from a temporary copy of the document otherwise. On
    UnkownConvertError an error picture is served instead (the small one
    when ``size`` equals THUMBNAIL_SIZE, the medium one otherwise).
    """
    check_permissions(request.user, 'documents', [PERMISSION_DOCUMENT_VIEW])

    document = get_object_or_404(Document, pk=document_id)

    page = int(request.GET.get('page', 1))
    transformation_list = []
    try:
        #Catch invalid or non existing pages
        document_page = DocumentPage.objects.get(document=document, page_number=page)
        for page_transformation in document_page.documentpagetransformation_set.all():
            try:
                if page_transformation.transformation in TRANFORMATION_CHOICES:
                    # SECURITY NOTE(review): eval() executes whatever is
                    # stored in the arguments field; confirm who can write
                    # it and consider a safe literal parser instead.
                    output = TRANFORMATION_CHOICES[page_transformation.transformation] % eval(page_transformation.arguments)
                    transformation_list.append(output)
            except Exception as e:
                # A broken transformation is skipped; only staff are told
                if request.user.is_staff:
                    messages.warning(request, _(u'Error for transformation %(transformation)s:, %(error)s') %
                        {'transformation':page_transformation.get_transformation_display(),
                        'error':e})
    except ObjectDoesNotExist:
        # No such page: continue without any transformations
        pass

    tranformation_string = ' '.join(transformation_list)
    try:
        # The cache and the converter receive a zero based page index
        filepath = in_image_cache(document.checksum, size=size, quality=quality, extra_options=tranformation_string, page=page-1)
        if filepath:
            # Images are binary data, so open them in 'rb' mode
            return serve_file(request, File(file=open(filepath, 'rb')), content_type='image/jpeg')
        #Save to a temporary location
        filepath = document_save_to_temp_dir(document, filename=document.checksum)
        output_file = convert(filepath, size=size, format='jpg', quality=quality, extra_options=tranformation_string, page=page-1)
        return serve_file(request, File(file=open(output_file, 'rb')), content_type='image/jpeg')
    except UnkownConvertError as e:
        if request.user.is_staff or request.user.is_superuser:
            messages.error(request, e)
        if size == THUMBNAIL_SIZE:
            error_picture = PICTURE_ERROR_SMALL
        else:
            error_picture = PICTURE_ERROR_MEDIUM
        return serve_file(request, File(file=open('%simages/%s' % (settings.MEDIA_ROOT, error_picture), 'rb')))
예제 #14
0
파일: views.py 프로젝트: coulix/mayan
def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_DEFAULT):
    """
    Serve a JPEG rendering of one page of a document.

    The page number is read from the ``page`` GET parameter (default 1).
    The transformations stored for that page are turned into an option
    string, and the image is served from the image cache when present or
    converted from a temporary copy of the document otherwise. On
    UnkownConvertError one of two fixed error pictures under
    ``MEDIA_ROOT`` is served instead, chosen by whether ``size`` equals
    THUMBNAIL_SIZE.
    """
    check_permissions(request.user, 'documents', [PERMISSION_DOCUMENT_VIEW])

    document = get_object_or_404(Document, pk=document_id)

    page = int(request.GET.get('page', 1))
    transformation_list = []
    try:
        #Catch invalid or non existing pages
        document_page = DocumentPage.objects.get(document=document, page_number=page)
        for page_transformation in document_page.documentpagetransformation_set.all():
            try:
                if page_transformation.transformation in TRANFORMATION_CHOICES:
                    # SECURITY NOTE(review): eval() executes whatever is
                    # stored in the arguments field; confirm who can write
                    # it and consider a safe literal parser instead.
                    output = TRANFORMATION_CHOICES[page_transformation.transformation] % eval(page_transformation.arguments)
                    transformation_list.append(output)
            except Exception as e:
                # A broken transformation is skipped; only staff are told
                if request.user.is_staff:
                    messages.warning(request, _(u'Error for transformation %(transformation)s:, %(error)s') %
                        {'transformation':page_transformation.get_transformation_display(),
                        'error':e})
    except ObjectDoesNotExist:
        # No such page: continue without any transformations
        pass

    tranformation_string = ' '.join(transformation_list)
    try:
        # The cache and the converter receive a zero based page index
        filepath = in_image_cache(document.checksum, size=size, quality=quality, extra_options=tranformation_string, page=page-1)
        if filepath:
            # Images are binary data, so open them in 'rb' mode
            return serve_file(request, File(file=open(filepath, 'rb')), content_type='image/jpeg')
        #Save to a temporary location
        filepath = document_save_to_temp_dir(document, filename=document.checksum)
        output_file = convert(filepath, size=size, format='jpg', quality=quality, extra_options=tranformation_string, page=page-1)
        return serve_file(request, File(file=open(output_file, 'rb')), content_type='image/jpeg')
    except UnkownConvertError as e:
        if request.user.is_staff or request.user.is_superuser:
            messages.error(request, e)
        if size == THUMBNAIL_SIZE:
            error_picture_path = '%simages/picture_error.png' % settings.MEDIA_ROOT
        else:
            error_picture_path = '%simages/1297211435_error.png' % settings.MEDIA_ROOT
        return serve_file(request, File(file=open(error_picture_path, 'rb')))
예제 #15
0
파일: views.py 프로젝트: strogo/mayan
                transformation_list.append(output)
        except Exception, e:
            if request.user.is_staff:
                messages.warning(
                    request,
                    _(u"Error for transformation %(transformation)s:, %(error)s")
                    % {"transformation": page_transformation.get_transformation_display(), "error": e},
                )
            else:
                pass
    tranformation_string = " ".join(transformation_list)

    try:
        filepath = StagingFile.get(staging_file_id).filepath
        output_file = convert(
            filepath, size=STAGING_FILES_PREVIEW_SIZE, extra_options=tranformation_string, cleanup_files=False
        )
        return serve_file(request, File(file=open(output_file, "r")), content_type="image/jpeg")
    except UnkownConvertError, e:
        if request.user.is_staff or request.user.is_superuser:
            messages.error(request, e)
        return serve_file(request, File(file=open(u"%simages/%s" % (settings.MEDIA_ROOT, PICTURE_ERROR_MEDIUM), "r")))
    except UnknownFormat:
        return serve_file(request, File(file=open(u"%simages/%s" % (settings.MEDIA_ROOT, PICTURE_UNKNOWN_MEDIUM), "r")))
    except Exception, e:
        if request.user.is_staff or request.user.is_superuser:
            messages.error(request, e)
        return serve_file(request, File(file=open(u"%simages/%s" % (settings.MEDIA_ROOT, PICTURE_ERROR_MEDIUM), "r")))


# TODO: Need permission
예제 #16
0
        try:
            if page_transformation['name'] in TRANFORMATION_CHOICES:
                output = TRANFORMATION_CHOICES[page_transformation['name']] % eval(page_transformation['arguments'])
                transformation_list.append(output)
        except Exception, e:
            if request.user.is_staff:
                messages.warning(request, _(u'Error for transformation %(transformation)s:, %(error)s') % 
                    {'transformation':page_transformation.get_transformation_display(),
                    'error':e})
            else:
                pass
    tranformation_string = ' '.join(transformation_list)

    try:
        filepath = StagingFile.get(staging_file_id).filepath
        output_file = convert(filepath, size=STAGING_FILES_PREVIEW_SIZE, extra_options=tranformation_string, cleanup_files=False)
        return serve_file(request, File(file=open(output_file, 'r')), content_type='image/jpeg')
    except UnkownConvertError, e:
        if request.user.is_staff or request.user.is_superuser:
            messages.error(request, e)
        return serve_file(request, File(file=open(u'%simages/%s' % (settings.MEDIA_ROOT, PICTURE_ERROR_MEDIUM), 'r')))
    except UnknownFormat:
        return serve_file(request, File(file=open(u'%simages/%s' % (settings.MEDIA_ROOT, PICTURE_UNKNOWN_MEDIUM), 'r')))
    except Exception, e:
        if request.user.is_staff or request.user.is_superuser:
            messages.error(request, e)
        return serve_file(request, File(file=open(u'%simages/%s' % (settings.MEDIA_ROOT, PICTURE_ERROR_MEDIUM), 'r')))   


#TODO: Need permission     
def staging_file_delete(request, staging_file_id):
예제 #17
0
파일: staging.py 프로젝트: IHLeanne/mayan
 def get_valid_image(self, size=THUMBNAIL_SIZE, transformations=None):
     """
     Return the result of converting this file to ``size``.

     ``transformations`` is forwarded to ``convert`` unchanged and
     ``cleanup_files`` is always passed as False. Conversion errors are not
     handled here and propagate to the caller.
     """
     conversion_options = {
         'size': size,
         'cleanup_files': False,
         'transformations': transformations,
     }
     return convert(self.filepath, **conversion_options)
예제 #18
0
 def get_valid_image(self, size=THUMBNAIL_SIZE, transformations=None):
     """
     Convert this file to ``size`` and return what ``convert`` returns.

     The given ``transformations`` are passed through as is, and
     ``cleanup_files=False`` is always sent to ``convert``. No exception
     is caught in this method.
     """
     converted = convert(
         self.filepath, size=size, cleanup_files=False, transformations=transformations
     )
     return converted
예제 #19
0
파일: models.py 프로젝트: MechanisM/mayan
 def get_valid_image(self, size=DISPLAY_SIZE, page=DEFAULT_PAGE_NUMBER, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION):
     """
     Return the result of converting the cached image of ``page``.

     The cached image name is obtained from ``get_image_cache_name`` and
     converted with the requested ``size``, ``zoom`` and ``rotation``,
     passing ``cleanup_files=False``. No exception is caught here.
     """
     cached_page_image = self.get_image_cache_name(page=page)
     render_options = {
         'cleanup_files': False,
         'size': size,
         'zoom': zoom,
         'rotation': rotation,
     }
     return convert(cached_page_image, **render_options)
예제 #20
0
파일: staging.py 프로젝트: mrcrabby/mayan
 def preview(self):
     """
     Return a ``(converted file, errors)`` pair for previewing this file.

     The option string and the errors come from
     ``get_transformation_string(DEFAULT_TRANSFORMATIONS)``; the file is
     converted to STAGING_FILES_PREVIEW_SIZE with ``cleanup_files=False``.
     """
     preview_options, transformation_errors = get_transformation_string(DEFAULT_TRANSFORMATIONS)
     return (
         convert(
             self.filepath,
             size=STAGING_FILES_PREVIEW_SIZE,
             extra_options=preview_options,
             cleanup_files=False,
         ),
         transformation_errors,
     )