示例#1
0
def ocr(args):
    """Perform optical character recognition (OCR) on a given image.

    The image is specified either as a path to a file (``args.path``) or as
    the primary key of a Page object (``args.pk``). The latter case also
    enables saving the resulting text to the object (``args.save``).

    Reads from ``args``: ``pk``, ``path``, ``save``, ``language``.

    Yields the recognized text, or a confirmation message when the text was
    saved to the Page object instead.

    Raises CommandError if the combination of arguments is invalid. Because
    this is a generator, validation happens on first iteration.
    """
    # Explicit checks instead of ``assert``: assertions are stripped when
    # Python runs with -O, which would let invalid combinations through and
    # leave ``page`` unbound in the ``--save`` branch below.
    if not (args.pk or args.path):
        raise CommandError('--path or --pk must be provided')
    if args.pk and args.path:
        raise CommandError('specify either --path or --pk')
    if args.path and args.save:
        raise CommandError('use --save only with --pk')

    if args.pk:
        db = app.get_feature('document_storage').default_db
        page = Page.object(db, args.pk)
        path = page.image.full_path
    else:
        path = args.path

    text = image_to_text(path, language=args.language)

    if args.save:
        # --save is only accepted together with --pk, so ``page`` is bound.
        page.details = text
        page.save()
        yield 'Page saved with the new text.'
    else:
        yield text
示例#2
0
文件: views.py 项目: neithere/orgtool
def page_index(request):
    db = app.get_feature('document_storage').default_db
    pages = Page.objects(db).order_by('date_time')
    for p in pages:
        print p.pk, p, p.summary, p.image
    return {'pages': pages,
            'thumbnail': _get_thumbnail}
示例#3
0
def add_pages(args):
    """Import image files as Page objects into the default document storage.

    Every path in ``args.paths`` is hashed; files whose hash is already
    stored as ``source_fingerprint`` of an existing Page are skipped. For the
    rest a Page is created with a summary, language, fingerprint, optional
    OCR text (``details``) and the image itself, re-saved through
    ``TMP_FILENAME`` in the chosen format.

    Reads from ``args``: ``paths``, ``language``, ``summary``,
    ``summary_prefix``, ``no_ocr``, ``skip_ocr_errors``, ``format``.

    Yields progress messages (one or more per file).

    Raises CommandError if any of the paths does not exist, or if OCR fails
    with RuntimeError and ``args.skip_ocr_errors`` is not set. Because this
    is a generator, nothing happens until it is iterated.
    """
    # NOTE(review): presumably converts the listed attributes to unicode
    # in place -- confirm against the helper's definition.
    _args_to_unicode(args, ['language', 'summary', 'summary_prefix'])
    db = app.get_feature('document_storage').default_db

    # Check that all files exist before importing anything. An explicit
    # check is used instead of ``assert`` because assertions are stripped
    # under -O and give no hint about which path is missing.
    for path in args.paths:
        if not os.path.exists(path):
            raise CommandError('file not found: {0}'.format(path))

    # import
    for path in args.paths:
        yield '* importing {0} (language {1})'.format(path, args.language)

        # close the source file as soon as it has been hashed
        with open(path, 'rb') as source:
            fingerprint = get_file_hash(source)

        # check file hash uniqueness
        if Page.objects(db).where(source_fingerprint=fingerprint):
            yield '...already in the database.'
            continue

        p = Page()

        p.summary = args.summary or get_summary_from_path(path)
        if args.summary_prefix:
            p.summary = u'{0} {1}'.format(args.summary_prefix, p.summary)
        p.language = args.language or None
        p.source_fingerprint = fingerprint
        if not args.no_ocr:
            try:
                p.details = image_to_text(path=path, language=p.language)
            except RuntimeError as e:
                if not args.skip_ocr_errors:
                    raise CommandError(e)
                yield '(OCR failed, saving only image itself)'

        # usually we don't need heavy formats like ppm or tiff even for OCR
        img = Image.open(path)
        if args.format:
            fmt = args.format
        elif img.format not in IMAGE_FORMATS:
            fmt = IMAGE_FORMATS[0]
        else:
            fmt = img.format
        img.save(TMP_FILENAME, fmt)

        # The temporary file is overwritten on the next iteration, so its
        # handle is only needed until the page has been saved; close it
        # afterwards instead of leaking one handle per imported file.
        with open(TMP_FILENAME, 'rb') as image_file:
            p['image'] = image_file
            # provide original path so that the resulting filename is
            # inherited
            p['image'].path = path

            p.save(db)