예제 #1
0
파일: tasks.py 프로젝트: ddohler/webocr
def document_analysis(docid):
    """Inspect a document and queue the matching page-extraction task.

    Determines the file format and page count and, for PDFs, the number of
    embedded images and whether text is present. The format and page count
    are stored on the document, and then either ``pages_from_images`` or
    ``pages_from_rasterize`` is queued.

    Args:
        docid: Identifier passed to ``util.is_valid_doc`` to load the
            document; the same value is forwarded to the queued task.
    """
    #TODO: Check for multiple objects?
    doc = util.is_valid_doc(docid)

    doc.file_format = util.determine_format(doc)

    ### Counting pages and repairing damaged documents ###
    num_pages = util.count_pages(doc)
    #TODO: Repair damaged PDFs (the disabled draft checked for a page count
    # of -1) by running pdftk and counting again. The draft command used the
    # same path for input and output, which doesn't quite work; it needs to
    # make a copy first or update the object's field. If the count is still
    # undetectable afterwards, report that the document cannot be processed.

    if doc.file_format == 'pdf':
        # Counting the number of pages may fail; PyPdf doesn't handle
        # corrupt PDFs well.
        num_imgs = util.count_images(doc)
        has_text = util.detect_text(doc)
    else:
        num_imgs = num_pages  # For TIFFs num_pages might be >1
        has_text = False

    # Store the results BEFORE queuing the follow-up task. Both follow-up
    # tasks reload the document by id and may correct num_pages themselves;
    # saving after dispatch could run after (or, with eager execution,
    # strictly after) that correction and overwrite it with this stale count.
    doc.num_pages = num_pages
    doc.save()

    # Decide what to do.
    # The equality comparison is kept so that a non-boolean result such as
    # None from detect_text still falls through to rasterization.
    if has_text == False and num_imgs == num_pages:
        # Simple case: no text and one image per page.
        pages_from_images.delay(docid)
    else:
        # Text only (nothing to OCR), mixed image / text, or anything else:
        # for now all of these are rasterized and then processed.
        pages_from_rasterize.delay(docid)
예제 #2
0
def document_analysis(docid):
    """Analyse a document and hand it to the right page-building task.

    The document is loaded through ``util.is_valid_doc``. Its format and page
    count are determined and written back to the document, and one of the
    ``pages_from_images`` / ``pages_from_rasterize`` tasks is queued with the
    same ``docid``.

    Args:
        docid: Identifier used to load the document and forwarded unchanged
            to the queued task.
    """
    #TODO: Check for multiple objects?
    doc = util.is_valid_doc(docid)
    doc.file_format = util.determine_format(doc)

    ### Counting pages and repairing damaged documents ###
    num_pages = util.count_pages(doc)
    #TODO: A pdftk-based repair of damaged PDFs (tried when the page count
    # came back as -1 in the disabled draft) is not implemented yet: the
    # command must not write onto its own input, so make a copy first or
    # update the object's field, then count again and report an error if the
    # page count is still undetectable.

    is_pdf = doc.file_format == 'pdf'
    if is_pdf:
        # Counting the number of pages may fail; PyPdf doesn't handle
        # corrupt PDFs well.
        num_imgs = util.count_images(doc)
        has_text = util.detect_text(doc)
    else:
        # For TIFFs num_pages might be >1; every page counts as an image.
        num_imgs = num_pages
        has_text = False

    # Persist first, dispatch second: the queued task loads the document by
    # id and may fix up num_pages itself, so a save issued after dispatch
    # could clobber that fix with the value counted here.
    doc.num_pages = num_pages
    doc.save()

    # Only the "no text, one image per page" case can use the images
    # directly. Text-only, mixed and unclassified documents are all
    # rasterized for now. `== False` is intentional: a None result from
    # detect_text must not be treated as "no text".
    images_only = has_text == False and num_imgs == num_pages
    next_task = pages_from_images if images_only else pages_from_rasterize
    next_task.delay(docid)
예제 #3
0
파일: tasks.py 프로젝트: ddohler/webocr
def pages_from_rasterize(docid):
    """Rasterize a document's pages and queue each page for binarization.

    Loads the document via ``util.is_valid_doc``, rasterizes it with
    ``util.rasterize_pdf``, brings the stored page count in line with the
    number of files produced, and then creates one ``DocumentPage`` per file
    and passes it to ``binarize_page``.

    Args:
        docid: Identifier used to load the document.
    """
    document = util.is_valid_doc(docid)
    print("Rasterizing pages...")
    rasterized = util.rasterize_pdf(document)

    # The page count stage may not have been able to determine the count;
    # the number of rasterized files takes precedence.
    produced = len(rasterized)
    if document.num_pages != produced:
        document.num_pages = produced
        document.save()

    for number, page_file in enumerate(rasterized):
        # Each entry supplies the file prefix first and the extension second.
        page = DocumentPage(
            document=document,
            files_prefix=page_file[0],
            stage_output_extension=page_file[1],
            page_number=number,
            start_process_date=datetime.now(),
            status='w',
        )
        page.save()

        # Docs already guaranteed converted, move to binarization.
        binarize_page.delay(page)
예제 #4
0
def pages_from_rasterize(docid):
    """Turn a document into rasterized pages and start binarizing them.

    The document is looked up with ``util.is_valid_doc`` and rasterized by
    ``util.rasterize_pdf``. If the stored page count disagrees with the number
    of files returned, the stored count is replaced and saved. A
    ``DocumentPage`` is then saved for every page and handed to
    ``binarize_page``.

    Args:
        docid: Identifier used to load the document.
    """
    source_doc = util.is_valid_doc(docid)
    print("Rasterizing pages...")
    outputs = util.rasterize_pdf(source_doc)

    if source_doc.num_pages != len(outputs):
        # Page count stage couldn't determine the count; use the file count.
        source_doc.num_pages = len(outputs)
        source_doc.save()

    for index in range(source_doc.num_pages):
        page_fields = {
            'document': source_doc,
            'files_prefix': outputs[index][0],
            'stage_output_extension': outputs[index][1],
            'page_number': index,
            'start_process_date': datetime.now(),
            'status': 'w',
        }
        new_page = DocumentPage(**page_fields)
        new_page.save()

        # Docs already guaranteed converted, so skip straight to binarization.
        binarize_page.delay(new_page)
예제 #5
0
파일: tasks.py 프로젝트: ddohler/webocr
def pages_from_images(docid):
    """Split a document into per-page files and queue each for conversion.

    Loads the document via ``util.is_valid_doc``, splits it with
    ``util.split_to_files``, brings the stored page count in line with the
    number of files produced, and then creates one ``DocumentPage`` per file
    and passes it to ``convert_page``.

    Args:
        docid: Identifier used to load the document.
    """
    document = util.is_valid_doc(docid)
    print("Constructing pages from images...")
    #TODO: Consider splitting to multi-page TIFF so tesseract can learn

    split_files = util.split_to_files(document)

    # The page count stage may not have been able to determine the count;
    # the number of split files takes precedence.
    produced = len(split_files)
    if document.num_pages != produced:
        document.num_pages = produced
        document.save()

    # One DocumentPage per file returned by the split function; each one is
    # then sent on to the conversion task.
    for number, page_file in enumerate(split_files):
        page = DocumentPage(
            document=document,
            files_prefix=page_file[0],
            stage_output_extension=page_file[1],
            page_number=number,
            start_process_date=datetime.now(),
            status='w',
        )
        page.save()

        convert_page.delay(page)
예제 #6
0
def pages_from_images(docid):
    """Build page records from a document's images and start converting them.

    The document is looked up with ``util.is_valid_doc`` and split by
    ``util.split_to_files``. If the stored page count disagrees with the
    number of files returned, the stored count is replaced and saved. A
    ``DocumentPage`` is then saved for every page and handed to
    ``convert_page``.

    Args:
        docid: Identifier used to load the document.
    """
    source_doc = util.is_valid_doc(docid)
    print("Constructing pages from images...")
    #TODO: Consider splitting to multi-page TIFF so tesseract can learn

    outputs = util.split_to_files(source_doc)

    if source_doc.num_pages != len(outputs):
        # Page count stage couldn't determine the count; use the file count.
        source_doc.num_pages = len(outputs)
        source_doc.save()

    # Create a DocumentPage for each file returned by the split function,
    # then launch the conversion task for that DocumentPage.
    for index in range(source_doc.num_pages):
        page_fields = {
            'document': source_doc,
            'files_prefix': outputs[index][0],
            'stage_output_extension': outputs[index][1],
            'page_number': index,
            'start_process_date': datetime.now(),
            'status': 'w',
        }
        new_page = DocumentPage(**page_fields)
        new_page.save()

        convert_page.delay(new_page)