예제 #1
0
파일: tasks.py 프로젝트: ddohler/webocr
def document_analysis(docid):
    """Inspect a document, store its metadata, and queue page extraction.

    The file format and page count are obtained from ``util`` and saved on
    the document *before* any follow-up task is queued, so a worker that
    picks the task up immediately sees the stored values and this function
    never overwrites changes made by that worker.

    Exactly one follow-up task is queued with ``.delay(docid)``:
    ``pages_from_images`` when no text was detected and the image count
    equals the page count, otherwise ``pages_from_rasterize``.

    Args:
        docid: Identifier handed to ``util.is_valid_doc`` and forwarded to
            the queued task.
    """
    #TODO: Check for multiple objects?
    doc = util.is_valid_doc(docid)

    doc.file_format = util.determine_format(doc)

    ### Counting pages and repairing damaged documents ###
    num_pages = util.count_pages(doc)
    #TODO: Repair damaged PDFs when the page count is -1. Rewriting the file
    # in place with pdftk doesn't quite work; need to make a copy first or
    # update the object's field, then count the pages again. If the count is
    # still undetectable, report that the document cannot be processed.

    if doc.file_format == 'pdf':
        # Counting the number of pages may fail; PyPdf doesn't handle
        # corrupt PDFs well.
        num_imgs = util.count_images(doc)
        has_text = util.detect_text(doc)
    else:
        num_imgs = num_pages  # For TIFFs num_pages might be >1
        has_text = False

    # Persist the metadata before dispatching: once .delay() is called the
    # task may start running at any moment.
    doc.num_pages = num_pages
    doc.save()

    # Decide what to do.
    # The explicit "== False" is kept on purpose: the return type of
    # util.detect_text is not visible here, and a non-boolean result such as
    # None must keep falling through to rasterization.
    if has_text == False and num_imgs == num_pages:
        # Simple case: one image per page and no text layer.
        pages_from_images.delay(docid)
    else:
        # Text only (nothing to OCR), mixed image/text, or anything else:
        # for now every one of these cases is rasterized page by page.
        pages_from_rasterize.delay(docid)
예제 #2
0
def document_analysis(docid):
    """Analyse a document and queue the matching page-extraction task.

    Steps, in order:

    1. Look the document up with ``util.is_valid_doc``.
    2. Record its file format and page count and save the document.
    3. Queue one follow-up task via ``.delay(docid)``.

    Saving happens before queuing so that the follow-up task cannot run
    against a document whose format/page count has not been stored yet, and
    so that this function's ``save()`` cannot clobber the task's own writes.

    Args:
        docid: Identifier passed to ``util.is_valid_doc`` and to the queued
            task.
    """
    #TODO: Check for multiple objects?
    doc = util.is_valid_doc(docid)
    doc.file_format = util.determine_format(doc)

    ### Counting pages and repairing damaged documents ###
    num_pages = util.count_pages(doc)
    #TODO: When num_pages is -1 for a PDF, try to repair it (e.g. pdftk) and
    # count again. The in-place repair command doesn't quite work; need to
    # make a copy first or update the object's field. If the page count is
    # still undetectable, report that the document cannot be processed.

    is_pdf = doc.file_format == 'pdf'
    if is_pdf:
        # Counting the number of pages may fail; PyPdf doesn't handle
        # corrupt PDFs well.
        num_imgs = util.count_images(doc)
        has_text = util.detect_text(doc)
    else:
        # For TIFFs num_pages might be >1; every page is an image.
        num_imgs = num_pages
        has_text = False

    # Store the results first; the queued task may start immediately.
    doc.num_pages = num_pages
    doc.save()

    # Decide what to do. Only the "no text, one image per page" case can use
    # the embedded images directly. Text-only, mixed image/text and every
    # other combination are currently all rasterized.
    # NOTE: "== False" is deliberate; util.detect_text's return type is not
    # visible here and non-boolean results must still be rasterized.
    images_only = has_text == False and num_imgs == num_pages
    if images_only:
        pages_from_images.delay(docid)
    else:
        pages_from_rasterize.delay(docid)
예제 #3
0
파일: comment.py 프로젝트: synee/abillist
def count_pages():
    """Return ``util.count_pages`` applied to the number of ``Comment`` entities."""
    total_comments = db.Query(Comment).count()
    return util.count_pages(total_comments)
예제 #4
0
def count_pages():
    """Count all ``Comment`` entities and pass the total to ``util.count_pages``."""
    comment_query = db.Query(Comment)
    n_comments = comment_query.count()
    return util.count_pages(n_comments)
예제 #5
0
파일: post.py 프로젝트: neuront/nijipress
def count_pages_by_tag(t):
    """Return ``util.count_pages`` for the tag-post relations matching *t*.

    Args:
        t: Value compared against the ``tag`` property of ``tag.TagPostR``.
    """
    tagged = db.Query(tag.TagPostR).filter('tag =', t)
    return util.count_pages(tagged.count())
예제 #6
0
파일: post.py 프로젝트: neuront/nijipress
def count_pages():
    """Return ``util.count_pages`` applied to the number of ``Post`` entities."""
    total_posts = db.Query(Post).count()
    return util.count_pages(total_posts)
예제 #7
0
파일: post.py 프로젝트: neuront/nijinote
def count_pages_by_tag(t):
    """Count ``tag.TagPostR`` rows whose ``tag`` equals *t* and page the total.

    Args:
        t: Tag value used in the ``'tag ='`` filter.

    Returns:
        Whatever ``util.count_pages`` returns for the matching row count.
    """
    relation_query = db.Query(tag.TagPostR)
    matching = relation_query.filter('tag =', t)
    n_matching = matching.count()
    return util.count_pages(n_matching)
예제 #8
0
파일: post.py 프로젝트: neuront/nijinote
def count_pages():
    """Count all ``Post`` entities and pass the total to ``util.count_pages``."""
    post_query = db.Query(Post)
    n_posts = post_query.count()
    return util.count_pages(n_posts)