示例#1
0
def delete():
    """Delete a model file (or file group) for a trainset/config pair.

    Reads ``trainname``, ``config`` and ``filename`` from the query string,
    resolves the on-disk location based on the configured OCR engine,
    removes the file(s), then redirects back to the model listing page.

    NOTE(review): ``filename`` comes straight from the request and is joined
    into a filesystem path before ``os.remove`` — confirm a path-traversal
    check (rejecting ``..`` / absolute paths) happens upstream.
    NOTE(review): this uses ``file_name.endswith('.traineddata')`` while the
    sibling ``download``/``upload`` routes use ``'.traineddata' in file_name``
    — confirm which test is intended.
    """
    trainset = request.args.get('trainname', None)
    config = request.args.get('config', None)
    model_dir = get_model_dir(trainset, config)
    file_name = request.args.get("filename", None)
    # The engine named in the config decides the on-disk model layout.
    config_content = read_json(os.path.join(config_root, config))
    if config_content["engine"] == 'tesseract':
        # Final .traineddata files sit in the model dir; intermediate
        # checkpoints live in its "checkpoint" subdirectory.
        if file_name.endswith('.traineddata'):
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name)
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    "checkpoint", file_name)
        os.remove(out_file)
    elif config_content["engine"] == 'calamari':
        # A calamari model is a group of files sharing one prefix; remove
        # every file that starts with the requested name.
        files = os.listdir(os.path.join(os.getcwd(), model_root, model_dir))
        files = [
            os.path.join(model_root, model_dir, ele) for ele in files
            if ele.startswith(file_name)
        ]
        for out_file in files:
            os.remove(out_file)
    else:
        out_file = os.path.join(os.getcwd(), model_root, model_dir, file_name)
        os.remove(out_file)
    return redirect(
        url_for('manage_model_list', trainset=trainset, config=config))
示例#2
0
def download():
    """Serve a model file for the given trainset/config as a download.

    The on-disk location depends on the configured OCR engine:

    - tesseract: ``.traineddata`` files sit in the model dir, other files
      in its ``checkpoint`` subdirectory;
    - calamari: a model is a group of files sharing a prefix, which is
      (re)packed into a ``.tar.gz`` before sending (the ``report`` file is
      sent as-is);
    - any other engine: the file is served directly from the model dir.

    NOTE(review): ``attachment_filename`` is the pre-Flask-2.0 parameter
    name (renamed ``download_name`` in Flask 2.0) — confirm the pinned
    Flask version before upgrading.
    NOTE(review): ``filename`` is request-supplied and joined into a path —
    confirm path-traversal protection upstream.
    """
    trainset = request.args.get('trainname', None)
    config = request.args.get('config', None)
    model_dir = get_model_dir(trainset, config)
    file_name = request.args.get("filename", None)
    config_content = read_json(os.path.join(config_root, config))
    print(config_content["engine"])
    if config_content["engine"] == 'tesseract':
        if '.traineddata' in file_name:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name)
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    "checkpoint", file_name)
    elif config_content["engine"] == "calamari":
        if file_name != 'report':
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name + '.tar.gz')
            # Rebuild the archive from scratch so a stale one is not served.
            if os.path.exists(out_file):
                os.remove(out_file)
            files = os.listdir(os.path.join(model_root, model_dir))
            files = [
                os.path.join(model_root, model_dir, ele) for ele in files
                if ele.startswith(file_name)
            ]
            compress_file(files, out_file)
            file_name += '.tar.gz'
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name)
    else:
        out_file = os.path.join(os.getcwd(), model_root, model_dir, file_name)
    return send_file(out_file,
                     attachment_filename=file_name,
                     as_attachment=True)
示例#3
0
def manage_model_list():
    """Render the model overview page and dispatch evaluation requests.

    GET shows the list of trained model files plus a form for picking a
    model / test-set pair; POST redirects to the ``eval_model`` endpoint
    with the chosen pair.
    """
    trainset = request.args.get('trainset', None)
    config = request.args.get('config', None)
    model_files = get_model_list(get_model_dir(trainset, config))
    print(model_files)
    eval_form = SelectEvalForm()
    eval_form.select_model.choices = get_options(model_files)
    eval_form.select_test.choices = get_options(get_files())
    if request.method == 'POST':
        print("form:", request.form)
        # Map the submitted option values back to their display labels.
        data_choices = dict(get_options(get_files()))
        model_choices = dict(get_options(model_files))
        print(get_configs())
        chosen_model = model_choices.get(eval_form.select_model.data)
        chosen_test = data_choices.get(eval_form.select_test.data)
        print('model', chosen_model)
        print('test:', chosen_test)
        return redirect(
            url_for('eval_model',
                    trainname=trainset,
                    config=config,
                    testname=chosen_test,
                    modelname=chosen_model))
    return render_template('model_download.html',
                           form=eval_form,
                           trainname=trainset,
                           config=config,
                           files_list=model_files)
示例#4
0
def upload():
    """Publish a model file to the remote repository via ``publish_model``.

    Resolves the model file exactly like ``download`` (including repacking
    calamari model groups into a ``.tar.gz``), then uploads it together
    with a license and a metadata map.

    NOTE(review): ``remote_file='my_image.jpg'`` and the placeholder
    metadata / related-DOI values appear copied from an example snippet —
    confirm they are intended for production use.
    NOTE(review): ``is_draft=True`` means the publish request is never
    actually sent; the upload stays a draft.
    """
    trainset = request.args.get('trainname', None)
    config = request.args.get('config', None)
    model_dir = get_model_dir(trainset, config)
    file_name = request.args.get("filename", None)
    config_content = read_json(os.path.join(config_root, config))
    print(config_content["engine"])
    if config_content["engine"] == 'tesseract':
        if '.traineddata' in file_name:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name)
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    "checkpoint", file_name)
    elif config_content["engine"] == "calamari":
        if file_name != 'report':
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name + '.tar.gz')
            # Rebuild the archive so a stale one is not uploaded.
            if os.path.exists(out_file):
                os.remove(out_file)
            files = os.listdir(os.path.join(model_root, model_dir))
            files = [
                os.path.join(model_root, model_dir, ele) for ele in files
                if ele.startswith(file_name)
            ]
            compress_file(files, out_file)
            file_name += '.tar.gz'
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name)
    else:
        out_file = os.path.join(os.getcwd(), model_root, model_dir, file_name)
    print('uploading...')
    publish_model(
        access_token=app.token,
        model_file=out_file,  # local path
        remote_file='my_image.jpg',  # remote name (no path)
        ocr_engine=config_content[
            "engine"],  # OCR engine which can run the model
        license_name=
        'WTFPL',  # it seems that Zenodo recognizes acronyms, such as this one
        metadata={  # insert whatever you want in this map
            'info':
            'this map can contain anything; if you do not want it, set it to none',
            'content':
            'ideally it should contain all information about the training data, the parameters, the result accuracy, ...',
            'usage': 'this gets uploaded as metadata.json along with the model'
        },
        related_DOI=[
            ('cites', '123')
        ],  # should other DOI be refered to, add them here as pairs (link, doi), otherwise set this to None
        is_draft=
        True  # if true, then the publish request will not be sent and the upload will stay as a draft
    )
    print('uploaded!')
    return redirect(
        url_for('manage_model_list', trainset=trainset, config=config))
示例#5
0
File: routes.py  Project: stweil/okralact
def show_report():
    """Render the training report for a trainset/config pair.

    Reads the ``report`` file from the resolved model directory and shows
    its contents via the generic ``content.html`` template.

    NOTE(review): the trainset is read from the ``filename`` query
    parameter (not ``trainname``/``trainset`` as in the sibling routes) —
    confirm against the callers.
    """
    trainset = request.args.get("filename", None)
    config = request.args.get("config", None)
    model_dir = get_model_dir(trainset, config)
    filename = 'report'
    print(trainset, config, model_dir)
    # Context manager guarantees the handle is closed even if read() raises
    # (the original open()/read()/close() sequence leaked it on error).
    report_path = os.path.join(os.getcwd(), model_root, model_dir, filename)
    with open(report_path, 'r', encoding="utf-8") as report:
        content = report.read()
    return render_template('content.html', response=content)
示例#6
0
def get_old_traineddata(config):
    """Locate the .traineddata file of the model this config continues from.

    ``config["continue_from"]`` names the previous training run (trainset
    plus config file); the model prefix of that run determines both the
    subdirectory and the file name of its .traineddata.
    """
    previous = config["continue_from"]
    prev_config_name = previous["config"]
    prev_config = read_json(os.path.join(config_root, prev_config_name))
    prev_model_dir = get_model_dir(previous["trainset"], prev_config_name)
    # Fall back to the schema default when the old config omits the prefix.
    schema = read_json("engines/schemas/common.schema")
    if "model_prefix" in prev_config:
        prefix = prev_config["model_prefix"]
    else:
        prefix = schema["definitions"]["model_prefix"]["default"]
    return os.path.join(model_root, prev_model_dir, prefix,
                        prefix + '.traineddata')
示例#7
0
def translate_continue_path(engine, continue_from):
    """Resolve the on-disk path of the model to continue training from.

    Returns the path when the file exists, otherwise an empty string.
    """
    print(continue_from["trainset"], continue_from["config"])
    base_dir = get_model_dir(continue_from["trainset"],
                             continue_from["config"])
    model_name = continue_from["model"]
    # Each engine keeps its checkpoints in a slightly different layout.
    if engine == 'tesseract':
        candidate = os.path.join(model_root, base_dir, 'checkpoint',
                                 model_name)
    elif engine == 'calamari':
        candidate = os.path.join(model_root, base_dir, model_name + '.json')
    else:
        candidate = os.path.join(model_root, base_dir, model_name)
    return candidate if os.path.exists(candidate) else ''
示例#8
0
File: eval.py  Project: stweil/okralact
def eval_from_file(file_test, file_train, file_config,  model_file):
    """Evaluate a trained model on a test archive and write an eval report.

    Extracts the test set into the eval folder, builds and runs an
    engine-specific shell script over it, scores the predictions against
    the ``.gt.txt`` ground truth, compresses the raw recognizer output,
    and writes a textual report (char/word error rates plus a confusion
    table).  Returns the report file name.

    NOTE(review): the tesseract branch shells out to a hardcoded absolute
    path under ``/Users/doreen/...`` — this only works on that machine and
    should come from configuration.
    """
    clear_data(eval_folder)
    extract_file(pjoin(data_root, file_test), eval_folder)
    configs = read_json(pjoin(config_root, file_config))
    model_dir = get_model_dir(file_train, file_config)

    engine = configs["engine"]
    common_schema = read_json("engines/schemas/common.schema")
    # Fall back to the schema default when the config omits model_prefix.
    model_prefix = configs["model_prefix"] if "model_prefix" in configs \
        else common_schema["definitions"]["model_prefix"]["default"]
    # Non-tesseract engines run inside their own conda environment.
    cmd_list = [act_environ(engine)] if engine != 'tesseract' else []
    if engine == 'tesseract':
        # .traineddata files sit in the model dir; raw checkpoints live in
        # the "checkpoint" subdirectory.
        if model_file.endswith('.traineddata'):
            best_model = pjoin(model_root, model_dir, model_file)
        else:
            best_model = pjoin(model_root, model_dir, 'checkpoint', model_file)
    else:
        best_model = pjoin(model_root, model_dir, model_file)
    if engine == 'kraken':
        cmd_list.append('kraken -I \'%s/*.png\' -o .txt ocr -m %s -s'
                        % (eval_folder,  best_model))
    elif engine == 'calamari':
        cmd_list.append('calamari-predict --checkpoint %s --files %s/*.png'
                        % (best_model, eval_folder))
    elif engine == 'ocropus':
        cmd_list.append('ocropus-rpred -m %s \'%s/*.png\'' % (best_model, eval_folder))
    elif engine == 'tesseract':
        cmd_list.append('export TESSDATA_PREFIX=%s' % pjoin(model_root, model_dir))
        print('model_dir', pjoin(model_root, model_dir, 'checkpoint'))
        # A raw checkpoint must first be converted into a .traineddata file.
        if os.path.exists(pjoin(model_root, model_dir, 'checkpoint')):
            cmd_list.append('/Users/doreen/Documents/Experiment/Package/tesseract/src/training/lstmtraining --stop_training --continue_from %s --traineddata %s --model_output %s' %
                            (best_model,
                             pjoin(model_root, model_dir, model_prefix, '%s.traineddata' % model_prefix),
                             pjoin(model_root, model_dir, model_prefix + '.traineddata')))
        # Tesseract reads .tif images, so convert the eval set first.
        convert_image('engines/eval')
        image_files = get_all_files(data_folder=eval_folder, postfix='.tif')
        for imf in image_files:
            cmd_list.append('tesseract -l %s %s/%s.tif %s/%s ' % (model_prefix,
                                                                  eval_folder,
                                                                  imf,
                                                                  eval_folder,
                                                                  imf))
    if engine != 'tesseract':
        cmd_list.append(deact_environ)
    cmd = '\n'.join(cmd_list)
    print(cmd_list)
    subprocess.run(cmd, shell=True)
    gt_files = [eval_folder + '/' + ele for ele in os.listdir(eval_folder) if ele.endswith('.gt.txt')]
    # Calamari writes .pred.txt files; the other engines write plain .txt.
    if engine == 'calamari':
        res = evaluate(gt_files,  flag_confusion=1, extension='.pred.txt')
        res_files = [os.getcwd()  +  '/' +  ele[:-len(".gt.txt")] +  '.pred.txt' for ele in gt_files]
    else:
        res = evaluate(gt_files, flag_confusion=1)
        res_files = [os.getcwd()  +  '/' + ele[:-len(".gt.txt")] +  '.txt' for ele in gt_files]
    report_file = add_eval_report(file_test, file_train, file_config, model_file)
    # Bundle the raw recognizer output next to the textual report.
    out_file =  pjoin(os.getcwd(), eval_root, report_file  +  '.tar.gz')
    compress_file(res_files,  out_file)
    with open(pjoin(eval_root, report_file), 'w') as f_:
        f_.write('\nTotal characters:\t%d\n' % res["char_total"])
        f_.write('Character errors:\t%d\n' % res["char_errs"])
        f_.write('Character error rate:\t%.3f\n' % res["char_error_rate"])
        f_.write('Total words:\t%d\n' % res["word_total"])
        f_.write('Word errors:\t%d\n' % res["word_errs"])
        f_.write('Word error rate:\t%.3f\n\n\n' % res["word_error_rate"])
        f_.write('count\tgenerated\tcorrect\n')
        for v, a, b  in res["confusion"]:
            f_.write("%d\t%s\t%s\n" % (v, a, b))
    return report_file
示例#9
0
def validate_continue_from(continue_from_schema, new_config):
    """Validate the ``continue_from`` section of a new training config.

    Checks, in order: schema conformance, that the referenced old config
    exists, that old and new configs use the same engine, that the old
    model file exists on disk, and that any new model structure / append
    index is compatible with the old model.  Returns a (possibly empty)
    list of error strings; fatal errors stop validation early.
    """
    err_str = []
    engine = new_config["engine"]
    # Check the continue_from block against the engine's JSON schema.
    resolver = RefResolver('file://%s/engines/schemas/' % os.getcwd(), None)
    validator = Draft4Validator(continue_from_schema, resolver=resolver)
    continue_from = new_config["continue_from"]
    for error in validator.iter_errors(continue_from):
        err_str.append('parameter continue_from, %s' % error.message)
    if len(err_str) > 0:
        return err_str
    # The old run's config file must still exist.
    if not os.path.exists(os.path.join(config_root, continue_from["config"])):
        err_str.append(
            'parameter continue_from, configuration for the old model does not exists'
        )
        return err_str
    old_config = read_json(os.path.join(config_root, continue_from["config"]))
    # Old and new runs must use the same OCR engine.
    if old_config["engine"] != new_config["engine"]:
        err_str.append(
            'parameter engine, engines for old model and new model not match')
        return err_str
    # The model file to continue from must exist on disk; each engine keeps
    # its checkpoints in a slightly different layout.
    model_dir = get_model_dir(continue_from["trainset"],
                              continue_from["config"])
    if engine == 'tesseract':
        model_path = os.path.join(model_root, model_dir, 'checkpoint',
                                  continue_from["model"])
    else:
        model_path = os.path.join(model_root, model_dir,
                                  continue_from["model"])

    if engine == 'calamari':
        model_path += '.json'
    if not os.path.exists(model_path):
        err_str.append('parameter continue_from, model does not exist')
    # Validate the (optional) new model structure.
    if "model" in new_config:
        if engine == 'calamari':
            if new_config['model'] != old_config["model"]:
                err_str.append(
                    'parameter model, old model and new model must match for calamari.'
                )
                return err_str
        elif engine == 'ocropus':
            err_str.append(
                'parameters model, ocropus does not support new model structure for fine tuning.'
            )
            return err_str
        if "append" in new_config:
            append_index = new_config["append"]
            # (The original additionally re-tested '"append" not in
            # new_config' here, which is always false inside this branch;
            # the redundant clause and an unused concat_model assembly were
            # removed.)
            if append_index < 1:
                err_str.append(
                    'parameter append, please assign a valid append')
            old_model = old_config["model"]
            # Effective layer count: count a virtual input layer when the
            # old model does not declare one, and exclude a declared
            # output layer.
            len_old_model = len(
                old_model) if "input" in old_model[0] else len(old_model) + 1
            len_old_model = len_old_model - 1 if "output" in old_model[
                -1] else len_old_model
            if append_index >= len_old_model:
                err_str.append(
                    'parameter append, append_index must be less than the number of layers (excluding output layer, including input layer).'
                )
    else:
        if "append" in new_config:
            err_str.append(
                'parameter append, please specify the model structure to append.'
            )
        if engine == 'calamari':
            if "model" in old_config:
                err_str.append(
                    'parameter model, old model and new model must match for calamari'
                )
    return err_str
示例#10
0
def valid_from_file(file_train, file_config):
    """Validate every saved checkpoint of a training run; write a report.

    For each checkpoint index not already covered by the previous report,
    runs the engine's recognizer over the validation folder, evaluates
    character and word error rates, and appends one line per checkpoint to
    the ``report`` file in the model directory.  Already-validated indices
    reuse their stored result.  Afterwards copies the best model into
    place.  Returns 'No model yet.' when there are no checkpoints.

    NOTE(review): the tesseract branch shells out to a hardcoded absolute
    path under ``/Users/doreen/...`` — machine-specific; should come from
    configuration.
    """
    configs = read_json(pjoin('static/configs', file_config))
    model_dir = get_model_dir(file_train, file_config)
    engine = configs["engine"]
    common_schema = read_json("engines/schemas/common.schema")
    # Fall back to the schema default when the config omits model_prefix.
    model_prefix = configs["model_prefix"] if "model_prefix" in configs \
        else common_schema["definitions"]["model_prefix"]["default"]
    # dict_res: previously reported results keyed by checkpoint index.
    dict_res, best_perform, best_model = read_report(model_dir)
    # 'with' guarantees the report is flushed and closed on every path; the
    # original opened the file and never closed it (leaked handle, possibly
    # unflushed report, leaked also on the early return below).
    with open(pjoin(model_root, model_dir, 'report'), 'w') as f_out:
        model_postfix = get_model_postfixes(engine, model_dir, model_prefix)
        if len(model_postfix) == 0:
            return 'No model yet.'
        for index in model_postfix:
            if int(index) not in dict_res:
                model_file = get_model_path(engine, model_prefix, index)
                model_path = pjoin(model_root, model_dir, model_file)
                if engine == 'tesseract':
                    # Convert the checkpoint to .traineddata, then recognize
                    # every .tif in the validation folder.
                    cmd_list = [
                        'export TESSDATA_PREFIX=%s' %
                        pjoin(os.getcwd(), model_root, model_dir),
                        '/Users/doreen/Documents/Experiment/Package/tesseract/src/training/lstmtraining --stop_training --continue_from %s --traineddata %s --model_output %s'
                        % (model_path,
                           pjoin(model_root, model_dir, model_prefix,
                                 '%s.traineddata' % model_prefix),
                           pjoin(model_root, model_dir,
                                 model_prefix + '.traineddata'))
                    ]
                    convert_image(valid_folder)
                    image_files = get_all_files(data_folder=valid_folder,
                                                postfix='.tif')
                    for imf in image_files:
                        cmd_list.append(
                            'tesseract -l %s %s/%s.tif %s/%s ' %
                            (model_prefix, valid_folder, imf, valid_folder,
                             imf))
                else:
                    # Other engines run inside their own conda environment.
                    cmd_list = [act_environ(engine)]
                    cmd_list.append(get_cmd(engine, model_path))
                    cmd_list.append('conda deactivate')
                cmd = '\n'.join(cmd_list)
                subprocess.run(cmd, shell=True)
                gt_files = [
                    valid_folder + '/' + ele
                    for ele in os.listdir(valid_folder)
                    if ele.endswith('.gt.txt')
                ]
                # Calamari writes .pred.txt predictions; others plain .txt.
                if engine == 'calamari':
                    res_str = evaluate(gt_files,
                                       flag_confusion=0,
                                       extension='.pred.txt')
                else:
                    res_str = evaluate(gt_files, flag_confusion=0)
                # NOTE(review): '>' keeps the checkpoint with the HIGHEST
                # char error rate as "best"; if read_report's convention is
                # error minimization this should be '<' — confirm.
                if float(res_str["char_error_rate"]) > best_perform:
                    best_perform = float(res_str["char_error_rate"])
                    best_model = index
                f_out.write(
                    'Iteration: %s,  character  errors: %d, total characters: %d, char error rate: %s, word errors: %d, total words: %d, word error rate: %s\n'
                    % (index, res_str["char_errs"], res_str["char_total"],
                       res_str["char_error_rate"], res_str["word_errs"],
                       res_str["word_total"], res_str["word_error_rate"]))
            else:
                # Already validated in an earlier run — reuse the stored line.
                res_str = dict_res[int(index)]
                f_out.write('Iteration: %s, %s\n' % (res_str[1], res_str[0]))
    copy_best_model(engine, model_dir, model_prefix, best_model)


# from_file('train_500.tar.gz', 'sample_calamari.json')
# eval_from_file(model_dir='tess_new', engine='tesseract', model_prefix='tess')