def test_extract_data():
    """Check that ``recognize`` extracts language, image and text from test.jpg.

    The bundled ``test.jpg`` resource is read fully into memory and passed to
    ``recognize`` with ``['en']`` as the accepted languages and ``True`` as
    the third argument. The test expects the detected language to be ``'en'``,
    the second result to be an ``Image`` instance and the text to match the
    known content of the fixture.
    """
    from lembrar.recognize import recognize

    # Close the resource stream explicitly so the test does not leak an open
    # handle; only the raw bytes are needed afterwards.
    filedata = resource_stream(__name__, 'test.jpg')
    try:
        raw_image = filedata.read()
    finally:
        filedata.close()

    lang, img, text = recognize(raw_image, ['en'], True)

    assert lang == 'en'
    assert isinstance(img, Image)
    assert text == u'This is a test\n\nMore text\n\n'
def handle_update(
    db,
    id,
    accepted_languages,
    version,
):
    """Re-run recognition on a stored document and refresh its derived data.

    Fetches the document identified by ``id`` via ``db.find_one``, passes its
    ``raw_data`` to ``recognize`` together with ``accepted_languages`` (the
    third argument is ``False``), and then:

    * stores the detected language as the ``language`` plugin/canonical attr,
    * stores the recognized text under the ``ocr`` plugin,
    * registers ``ocr`` as a searchable field,
    * finishes parsing at ``version`` and reindexes the document.
    """
    document = db.find_one(id)

    # The middle element of the result is not needed here.
    language, _unused, ocr_text = recognize(
        document.raw_data,
        accepted_languages,
        False,
    )

    document.update_plugin_and_canonical_attr('language', language)
    document.update_plugin('ocr', ocr_text)
    document.register_searchable_field('ocr')
    document.finish_parsing(version)
    document.reindex()
def add(request):
    """Store every image found in the uploaded file as a new document.

    Request parameters read from ``request.params``:

    * ``title`` (required) -- stored with each document and added to the
      indexed text.
    * ``created`` (optional) -- falls back to ``datetime.utcnow()`` when
      missing or empty.
    * ``description`` (optional, default ``''``) -- added to the indexed text.
    * ``force_detection`` (optional, default ``'true'``) -- compared
      case-insensitively against ``'true'``.
    * ``file`` (required) -- its ``.file`` attribute is handed to
      ``get_images_from_stream``.

    Each image is run through ``recognize``; the resulting text plus the
    description and title is passed to ``index`` to build the search terms,
    and the document is inserted into ``request.db.docs``.

    Returns an ``HTTPServerError`` response as soon as ``recognize`` raises
    ``TypeError`` for an image. Images handled before the failing one have
    already been inserted at that point. Otherwise falls off the end and
    returns ``None``.
    """
    params = request.params
    title = params['title']
    # A missing key and an empty value both fall back to "now" (UTC).
    created = params.get('created') or datetime.utcnow()
    description = params.get('description', '')
    force_detection = params.get('force_detection', 'true').lower() == 'true'
    accepted_languages = request.registry.settings['accepted_languages']
    imgstream = params['file'].file
    doc_list = request.db.docs

    for image in get_images_from_stream(imgstream):
        try:
            lang, _img, text = recognize(image, accepted_languages,
                                         force_detection)
        except TypeError as e:
            # ``except ... as`` is valid on Python 2.6+ and Python 3, unlike
            # the old comma form.
            err_msg = "Error: " + str(e)
            request.session.flash(err_msg, 'failure')
            return HTTPServerError(explanation=err_msg,
                                   detail='Go back, unselect force '
                                          'detection, and try again')

        text += " " + description + " " + title
        # NOTE(review): for a string ``text`` this is always truthy because
        # the two separator spaces were just appended; the fallback is kept
        # to preserve existing behaviour.
        if text:
            search_terms = list(index(text, [lang]))
        else:
            search_terms = ''

        thumb = get_thumbnail(image)
        doc_list.insert({
            'img': Binary(image),
            'thumb': Binary(thumb),
            'created': created,
            'version': 4,
            'forced_detection': force_detection,
            'language': lang,
            'keywords': [],
            'search_terms': search_terms,
            'title': title,
        })
def add(request):
    """Insert one document per image contained in the uploaded file.

    Reads from ``request.params``: ``title`` (required), ``created``
    (optional; ``datetime.utcnow()`` is used when it is missing or empty),
    ``description`` (optional, default ``''``), ``force_detection``
    (optional, default ``'true'``; compared case-insensitively with
    ``'true'``) and ``file`` (required; its ``.file`` attribute is passed to
    ``get_images_from_stream``).

    For every image, ``recognize`` is called and the recognized text,
    followed by the description and title, is passed to ``index`` to produce
    the search terms. The document is then inserted into ``request.db.docs``.

    Returns an ``HTTPServerError`` response if ``recognize`` raises
    ``TypeError``; documents for earlier images have already been inserted
    when that happens. Returns ``None`` otherwise.
    """
    params = request.params
    title = params['title']
    # Missing and empty ``created`` values are treated alike: use "now" (UTC).
    created = params.get('created') or datetime.utcnow()
    description = params.get('description', '')
    force_detection = params.get('force_detection', 'true').lower() == 'true'
    accepted_languages = request.registry.settings['accepted_languages']
    imgstream = params['file'].file
    doc_list = request.db.docs

    for image in get_images_from_stream(imgstream):
        try:
            lang, _img, text = recognize(image, accepted_languages,
                                         force_detection)
        except TypeError as e:
            # The ``as`` form works on Python 2.6+ and on Python 3; the old
            # ``except TypeError, e`` spelling is a syntax error on Python 3.
            err_msg = "Error: " + str(e)
            request.session.flash(err_msg, 'failure')
            return HTTPServerError(explanation=err_msg,
                                   detail='Go back, unselect force '
                                          'detection, and try again')

        text += " " + description + " " + title
        # NOTE(review): with a string ``text`` the separators appended above
        # make this condition always true; the else branch is retained so
        # behaviour is unchanged.
        if text:
            search_terms = list(index(text, [lang]))
        else:
            search_terms = ''

        thumb = get_thumbnail(image)
        doc_list.insert({'img': Binary(image),
                         'thumb': Binary(thumb),
                         'created': created,
                         'version': 4,
                         'forced_detection': force_detection,
                         'language': lang,
                         'keywords': [],
                         'search_terms': search_terms,
                         'title': title})