def ocr(args):
    """Performs optical character recognition (OCR) on given image.

    The image can be defined either as path to a file or as the primary key
    of a Page object. The latter case also enables saving the resulting text
    to the object. By default the result is simply printed.

    Raises `CommandError` if neither or both of ``--path`` and ``--pk`` are
    given, or if ``--save`` is combined with ``--path``.
    """
    # Explicit checks instead of `assert`: assertions are stripped when Python
    # runs with -O, which would let invalid combinations through (e.g. --save
    # with --path would reach the saving branch with no `page` loaded).
    if not (args.pk or args.path):
        raise CommandError('--path or --pk must be provided')
    if args.pk and args.path:
        raise CommandError('specify either --path or --pk')
    if args.path and args.save:
        raise CommandError('use --save only with --pk')

    page = None
    if args.pk:
        # Resolve the image path through the stored Page object.
        db = app.get_feature('document_storage').default_db
        page = Page.object(db, args.pk)
        path = page.image.full_path
    else:
        path = args.path

    text = image_to_text(path, language=args.language)

    if args.save:
        # The validation above guarantees --pk was given, so `page` is set.
        page.details = text
        page.save()
        yield 'Page saved with the new text.'
    else:
        yield text
def page_index(request):
    """Returns the data for the index of pages.

    The result is a dict with two keys: ``pages`` holds the query for all
    `Page` objects in the default document storage database ordered by
    ``date_time``; ``thumbnail`` holds the `_get_thumbnail` helper.

    `request` is accepted but not used here.
    """
    db = app.get_feature('document_storage').default_db
    pages = Page.objects(db).order_by('date_time')
    # No per-page debug output here: dumping every page to stdout on each
    # call is noisy and forces an extra pass over the whole query.
    return {'pages': pages, 'thumbnail': _get_thumbnail}
def add_pages(args):
    """Imports image files as pages into the document storage.

    For every path in ``args.paths`` the file is fingerprinted; files whose
    fingerprint is already stored are skipped. Unless ``args.no_ocr`` is set,
    the text is extracted with OCR. The image is then re-saved (in
    ``args.format`` if given, otherwise in its own format if that is listed
    in `IMAGE_FORMATS`, otherwise in the first of `IMAGE_FORMATS`) and stored
    together with the new `Page`.

    Progress messages are yielded as strings.

    Raises `CommandError` if any of the paths does not exist, or if OCR fails
    and ``args.skip_ocr_errors`` is not set.
    """
    _args_to_unicode(args, ['language', 'summary', 'summary_prefix'])

    db = app.get_feature('document_storage').default_db

    # Check that all files exist before importing anything. This is an
    # explicit check rather than `assert` because assertions are stripped
    # under -O and give the user no hint about which file is missing.
    missing = [path for path in args.paths if not os.path.exists(path)]
    if missing:
        raise CommandError('file not found: {0}'.format(', '.join(missing)))

    # import
    for path in args.paths:
        yield '* importing {0} (language {1})'.format(path, args.language)

        # Close the source file as soon as it has been hashed.
        with open(path, 'rb') as source:
            fingerprint = get_file_hash(source)

        # check file hash uniqueness
        if Page.objects(db).where(source_fingerprint=fingerprint):
            yield '...already in the database.'
            continue

        p = Page()
        p.summary = args.summary or get_summary_from_path(path)
        if args.summary_prefix:
            p.summary = u'{0} {1}'.format(args.summary_prefix, p.summary)
        p.language = args.language or None
        p.source_fingerprint = fingerprint

        if not args.no_ocr:
            try:
                p.details = image_to_text(path=path, language=p.language)
            except RuntimeError as e:
                if not args.skip_ocr_errors:
                    raise CommandError(e)
                yield '(OCR failed, saving only image itself)'

        # usually we don't need heavy formats like ppm or tiff even for OCR
        img = Image.open(path)
        if args.format:
            fmt = args.format
        elif img.format in IMAGE_FORMATS:
            fmt = img.format
        else:
            fmt = IMAGE_FORMATS[0]
        img.save(TMP_FILENAME, fmt)

        # Keep the converted file open only while the page is being saved.
        # NOTE(review): assumes p.save() reads the image content before it
        # returns; the reuse of TMP_FILENAME by the next iteration already
        # relies on that -- confirm against the storage backend.
        with open(TMP_FILENAME, 'rb') as image_file:
            p['image'] = image_file
            # provide original path so that the resulting filename is inherited
            p['image'].path = path
            p.save(db)