示例#1
0
def test_extract_data():
    """Run ``recognize`` on the bundled ``test.jpg`` and check its result.

    The call passes ``['en']`` as the accepted languages and ``True`` as the
    third argument, and expects a ``(language, image, text)`` triple back.
    """
    from lembrar.recognize import recognize

    # Load the sample image shipped next to this test module.
    stream = resource_stream(__name__, 'test.jpg')
    image_bytes = stream.read()

    result = recognize(image_bytes, ['en'], True)
    lang, img, text = result

    expected_text = u'This is a test\n\nMore text\n\n'
    assert lang == 'en'
    assert isinstance(img, Image)
    assert text == expected_text
示例#2
0
def test_extract_data():
    """Run ``recognize`` on the bundled ``test.jpg`` and check its result.

    The call passes ``['en']`` as the accepted languages and ``True`` as the
    third argument, and expects a ``(language, image, text)`` triple back.
    """
    from lembrar.recognize import recognize

    # Load the sample image shipped next to this test module.
    stream = resource_stream(__name__, 'test.jpg')
    image_bytes = stream.read()

    result = recognize(image_bytes, ['en'], True)
    lang, img, text = result

    expected_text = u'This is a test\n\nMore text\n\n'
    assert lang == 'en'
    assert isinstance(img, Image)
    assert text == expected_text
示例#3
0
def handle_update(db, id, accepted_languages, version):
    """Re-run recognition for one stored document and refresh its index.

    The document is fetched with ``db.find_one(id)`` and its ``raw_data`` is
    passed to ``recognize`` together with ``accepted_languages`` and ``False``
    as the third argument.  The detected language is stored via
    ``update_plugin_and_canonical_attr``, the recognized text is stored under
    the ``'ocr'`` plugin key and registered as a searchable field, and then
    parsing is finished for ``version`` and the document is reindexed.
    """
    document = db.find_one(id)
    raw = document.raw_data

    # Only the first and last elements of the result are used here.
    detected_lang, _unused, ocr_text = recognize(
        raw, accepted_languages, False)

    document.update_plugin_and_canonical_attr('language', detected_lang)
    document.update_plugin('ocr', ocr_text)
    document.register_searchable_field('ocr')
    document.finish_parsing(version)
    document.reindex()
示例#4
0
def add(request):
    """Store each image of the uploaded file as a record in ``request.db.docs``.

    Reads ``title`` (required), ``created``, ``description`` and
    ``force_detection`` from ``request.params``, the accepted languages from
    ``request.registry.settings`` and the upload stream from
    ``request.params['file'].file``.  Every image yielded by
    ``get_images_from_stream`` is passed to ``recognize``; the recognized
    text, extended with the description and title, is turned into search
    terms with ``index``, a thumbnail is built with ``get_thumbnail``, and
    one record is inserted per image.

    Returns an ``HTTPServerError`` (after flashing the message to the
    session) as soon as ``recognize`` raises ``TypeError`` for an image;
    records inserted for earlier images are not removed.
    """
    params = request.params
    title = params['title']
    # A missing or empty 'created' value falls back to the current UTC time.
    # NOTE(review): a supplied value is stored as received, while the
    # fallback is a datetime object -- confirm the intended stored type.
    created = params.get('created') or datetime.utcnow()
    description = params.get('description', '')
    force_detection = params.get('force_detection', 'true').lower() == 'true'
    accepted_languages = request.registry.settings['accepted_languages']

    imgstream = params['file'].file

    doc_list = request.db.docs

    for image in get_images_from_stream(imgstream):
        try:
            # The second element of the result is not needed in this view.
            lang, _img, text = recognize(image, accepted_languages,
                                         force_detection)
        except TypeError as e:
            err_msg = "Error: " + str(e)
            request.session.flash(err_msg, 'failure')
            return HTTPServerError(explanation=err_msg,
                                   detail='Go back, unselect force '
                                          'detection, and try again')

        text += " " + description + " " + title
        # The separators added above make `text` non-empty in every case, so
        # a plain truthiness test could never take the empty branch.  Check
        # for real content instead, and keep `search_terms` a list either way.
        if text.strip():
            search_terms = list(index(text, [lang]))
        else:
            search_terms = []
        thumb = get_thumbnail(image)

        doc_list.insert({
            'img': Binary(image),
            'thumb': Binary(thumb),
            'created': created,
            'version': 4,
            'forced_detection': force_detection,
            'language': lang,
            'keywords': [],
            'search_terms': search_terms,
            'title': title,
        })
示例#5
0
文件: add.py 项目: do3cc/Scanned-Docs
def add(request):
    """Store each image of the uploaded file as a record in ``request.db.docs``.

    Reads ``title`` (required), ``created``, ``description`` and
    ``force_detection`` from ``request.params``, the accepted languages from
    ``request.registry.settings`` and the upload stream from
    ``request.params['file'].file``.  Every image yielded by
    ``get_images_from_stream`` is passed to ``recognize``; the recognized
    text, extended with the description and title, is turned into search
    terms with ``index``, a thumbnail is built with ``get_thumbnail``, and
    one record is inserted per image.

    Returns an ``HTTPServerError`` (after flashing the message to the
    session) as soon as ``recognize`` raises ``TypeError`` for an image;
    records inserted for earlier images are not removed.
    """
    params = request.params
    title = params['title']
    # A missing or empty 'created' value falls back to the current UTC time.
    # NOTE(review): a supplied value is stored as received, while the
    # fallback is a datetime object -- confirm the intended stored type.
    created = params.get('created') or datetime.utcnow()
    description = params.get('description', '')
    force_detection = params.get('force_detection', 'true').lower() == 'true'
    accepted_languages = request.registry.settings['accepted_languages']

    imgstream = params['file'].file

    doc_list = request.db.docs

    for image in get_images_from_stream(imgstream):
        try:
            # The second element of the result is not needed in this view.
            lang, _img, text = recognize(image, accepted_languages,
                                         force_detection)
        except TypeError as e:
            err_msg = "Error: " + str(e)
            request.session.flash(err_msg, 'failure')
            return HTTPServerError(explanation=err_msg,
                                   detail='Go back, unselect force '
                                          'detection, and try again')

        text += " " + description + " " + title
        # The separators added above make `text` non-empty in every case, so
        # a plain truthiness test could never take the empty branch.  Check
        # for real content instead, and keep `search_terms` a list either way.
        if text.strip():
            search_terms = list(index(text, [lang]))
        else:
            search_terms = []
        thumb = get_thumbnail(image)

        doc_list.insert({'img': Binary(image),
                         'thumb': Binary(thumb),
                         'created': created,
                         'version': 4,
                         'forced_detection': force_detection,
                         'language': lang,
                         'keywords': [],
                         'search_terms': search_terms,
                         'title': title})