import os
import subprocess

from flask import redirect, render_template, request, send_file, url_for
from jsonschema import Draft4Validator, RefResolver

# Note: project-level names used below (app, pjoin, get_model_dir, read_json,
# config_root, model_root, data_root, eval_root, eval_folder, valid_folder,
# clear_data, extract_file, compress_file, convert_image, get_all_files,
# evaluate, add_eval_report, publish_model, SelectEvalForm, get_options,
# get_files, get_configs, get_model_list, read_report, get_model_postfixes,
# get_model_path, get_cmd, act_environ, deact_environ, copy_best_model) are
# assumed to be imported from the project's own modules, which are not shown here.


def delete():
    """Delete a model file (or, for calamari, every file sharing its prefix)."""
    trainset = request.args.get('trainname', None)
    config = request.args.get('config', None)
    model_dir = get_model_dir(trainset, config)
    file_name = request.args.get("filename", None)
    config_content = read_json(os.path.join(config_root, config))
    if config_content["engine"] == 'tesseract':
        # Final .traineddata files live in the model directory; intermediate
        # checkpoints live in its "checkpoint" subdirectory.
        if file_name.endswith('.traineddata'):
            out_file = os.path.join(os.getcwd(), model_root, model_dir, file_name)
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    "checkpoint", file_name)
        os.remove(out_file)
    elif config_content["engine"] == 'calamari':
        # A calamari checkpoint consists of several files; remove all of them.
        files = os.listdir(os.path.join(os.getcwd(), model_root, model_dir))
        files = [
            os.path.join(model_root, model_dir, ele) for ele in files
            if ele.startswith(file_name)
        ]
        for out_file in files:
            os.remove(out_file)
    else:
        out_file = os.path.join(os.getcwd(), model_root, model_dir, file_name)
        os.remove(out_file)
    return redirect(
        url_for('manage_model_list', trainset=trainset, config=config))
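# Example (sketch, hypothetical values): a delete link for a tesseract checkpoint
# would be built in a template roughly like
#   url_for('delete', trainname='train_500.tar.gz',
#           config='sample_tesseract.json', filename='tess_checkpoint_1000')
# The endpoint name 'delete' assumes the route decorator (not shown in this file)
# keeps the default endpoint name.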
def download():
    """Send a model file to the client; calamari checkpoints are bundled first."""
    trainset = request.args.get('trainname', None)
    config = request.args.get('config', None)
    model_dir = get_model_dir(trainset, config)
    file_name = request.args.get("filename", None)
    config_content = read_json(os.path.join(config_root, config))
    print(config_content["engine"])
    if config_content["engine"] == 'tesseract':
        if '.traineddata' in file_name:
            out_file = os.path.join(os.getcwd(), model_root, model_dir, file_name)
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    "checkpoint", file_name)
    elif config_content["engine"] == "calamari":
        if file_name != 'report':
            # A calamari checkpoint consists of several files; bundle them into
            # a fresh .tar.gz before sending.
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name + '.tar.gz')
            if os.path.exists(out_file):
                os.remove(out_file)
            files = os.listdir(os.path.join(model_root, model_dir))
            files = [
                os.path.join(model_root, model_dir, ele) for ele in files
                if ele.startswith(file_name)
            ]
            compress_file(files, out_file)
            file_name += '.tar.gz'
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir, file_name)
    else:
        out_file = os.path.join(os.getcwd(), model_root, model_dir, file_name)
    return send_file(out_file, attachment_filename=file_name, as_attachment=True)
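# Example (sketch, hypothetical values): downloading a calamari checkpoint named
# 'model_00000042' first packs model_00000042* from the model directory into
# model_00000042.tar.gz and then sends that archive, e.g. via
#   url_for('download', trainname='train_500.tar.gz',
#           config='sample_calamari.json', filename='model_00000042')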
def manage_model_list():
    trainset = request.args.get('trainset', None)
    config = request.args.get('config', None)
    model_dir = get_model_dir(trainset, config)
    file_list = get_model_list(model_dir)
    print(file_list)
    form = SelectEvalForm()
    form.select_model.choices = get_options(file_list)
    form.select_test.choices = get_options(get_files())
    if request.method == 'POST':
        print("form:", request.form)
        data_choices = dict(get_options(get_files()))
        model_choices = dict(get_options(file_list))
        print(get_configs())
        select_model = model_choices.get(form.select_model.data)
        select_test = data_choices.get(form.select_test.data)
        print('model', select_model)
        print('test:', select_test)
        return redirect(
            url_for('eval_model',
                    trainname=trainset,
                    config=config,
                    testname=select_test,
                    modelname=select_model))
    return render_template('model_download.html',
                           form=form,
                           trainname=trainset,
                           config=config,
                           files_list=file_list)
def upload():
    """Publish the selected model (bundling calamari checkpoints) via publish_model."""
    trainset = request.args.get('trainname', None)
    config = request.args.get('config', None)
    model_dir = get_model_dir(trainset, config)
    file_name = request.args.get("filename", None)
    config_content = read_json(os.path.join(config_root, config))
    print(config_content["engine"])
    if config_content["engine"] == 'tesseract':
        if '.traineddata' in file_name:
            out_file = os.path.join(os.getcwd(), model_root, model_dir, file_name)
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    "checkpoint", file_name)
    elif config_content["engine"] == "calamari":
        if file_name != 'report':
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name + '.tar.gz')
            if os.path.exists(out_file):
                os.remove(out_file)
            files = os.listdir(os.path.join(model_root, model_dir))
            files = [
                os.path.join(model_root, model_dir, ele) for ele in files
                if ele.startswith(file_name)
            ]
            compress_file(files, out_file)
            file_name += '.tar.gz'
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir, file_name)
    else:
        out_file = os.path.join(os.getcwd(), model_root, model_dir, file_name)
    print('uploading...')
    publish_model(
        access_token=app.token,
        model_file=out_file,  # local path
        remote_file='my_image.jpg',  # remote name (no path)
        ocr_engine=config_content["engine"],  # OCR engine which can run the model
        license_name='WTFPL',  # it seems that Zenodo recognizes acronyms, such as this one
        metadata={  # insert whatever you want in this map
            'info': 'this map can contain anything; if you do not want it, set it to none',
            'content': 'ideally it should contain all information about the training data, '
                       'the parameters, the result accuracy, ...',
            'usage': 'this gets uploaded as metadata.json along with the model'
        },
        # should other DOIs be referred to, add them here as (link, doi) pairs;
        # otherwise set this to None
        related_DOI=[('cites', '123')],
        # if True, the publish request will not be sent and the upload will stay a draft
        is_draft=True)
    print('uploaded!')
    return redirect(
        url_for('manage_model_list', trainset=trainset, config=config))
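# Note on the publish_model call above (sketch): according to its parameter
# comments, the selected model file is uploaded to Zenodo together with a
# metadata.json built from the `metadata` dict. With is_draft=True the publish
# request is not sent, so the deposition can presumably be reviewed or discarded
# in the Zenodo web interface before it is published; setting it to False would
# publish immediately.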
def show_report():
    trainset = request.args.get("filename", None)
    config = request.args.get("config", None)
    model_dir = get_model_dir(trainset, config)
    filename = 'report'
    print(trainset, config, model_dir)
    with open(os.path.join(os.getcwd(), model_root, model_dir, filename),
              'r', encoding="utf-8") as text:
        content = text.read()
    return render_template('content.html', response=content)
def get_old_traineddata(config):
    old_config_file = config["continue_from"]["config"]
    old_config = read_json(os.path.join(config_root, old_config_file))
    old_train = config["continue_from"]["trainset"]
    old_model_dir = get_model_dir(old_train, old_config_file)
    common_schema = read_json("engines/schemas/common.schema")
    old_model_prefix = old_config["model_prefix"] if "model_prefix" in old_config \
        else common_schema["definitions"]["model_prefix"]["default"]
    old_traineddata = os.path.join(model_root, old_model_dir, old_model_prefix,
                                   old_model_prefix + '.traineddata')
    return old_traineddata
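# Illustrative sketch (hypothetical file names): the config passed to
# get_old_traineddata() is expected to carry a "continue_from" block like
#   {
#       "engine": "tesseract",
#       "continue_from": {
#           "trainset": "train_500.tar.gz",
#           "config": "old_tesseract.json",
#           "model": "checkpoint_name"
#       }
#   }
# and the function resolves the <model_prefix>.traineddata of that earlier run.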
def translate_continue_path(engine, continue_from):
    print(continue_from["trainset"], continue_from["config"])
    model_dir = get_model_dir(continue_from["trainset"],
                              continue_from["config"])
    if engine == 'tesseract':
        continue_path = os.path.join(model_root, model_dir, 'checkpoint',
                                     continue_from["model"])
    elif engine == 'calamari':
        continue_path = os.path.join(model_root, model_dir,
                                     continue_from["model"] + '.json')
    else:
        continue_path = os.path.join(model_root, model_dir,
                                     continue_from["model"])
    if not os.path.exists(continue_path):
        return ''
    else:
        return continue_path
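# Example (sketch, hypothetical values):
#   translate_continue_path('calamari',
#                           {'trainset': 'train_500.tar.gz',
#                            'config': 'sample_calamari.json',
#                            'model': 'model_00000042'})
# would return '<model_root>/<model_dir>/model_00000042.json' if that file
# exists, and '' otherwise.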
def eval_from_file(file_test, file_train, file_config, model_file):
    """Run OCR with the given model on a test set and write an evaluation report."""
    clear_data(eval_folder)
    extract_file(pjoin(data_root, file_test), eval_folder)
    configs = read_json(pjoin(config_root, file_config))
    model_dir = get_model_dir(file_train, file_config)
    engine = configs["engine"]
    common_schema = read_json("engines/schemas/common.schema")
    model_prefix = configs["model_prefix"] if "model_prefix" in configs \
        else common_schema["definitions"]["model_prefix"]["default"]
    cmd_list = [act_environ(engine)] if engine != 'tesseract' else []
    if engine == 'tesseract':
        if model_file.endswith('.traineddata'):
            best_model = pjoin(model_root, model_dir, model_file)
        else:
            best_model = pjoin(model_root, model_dir, 'checkpoint', model_file)
    else:
        best_model = pjoin(model_root, model_dir, model_file)
    if engine == 'kraken':
        cmd_list.append('kraken -I \'%s/*.png\' -o .txt ocr -m %s -s' %
                        (eval_folder, best_model))
    elif engine == 'calamari':
        cmd_list.append('calamari-predict --checkpoint %s --files %s/*.png' %
                        (best_model, eval_folder))
    elif engine == 'ocropus':
        cmd_list.append('ocropus-rpred -m %s \'%s/*.png\'' %
                        (best_model, eval_folder))
    elif engine == 'tesseract':
        cmd_list.append('export TESSDATA_PREFIX=%s' % pjoin(model_root, model_dir))
        print('model_dir', pjoin(model_root, model_dir, 'checkpoint'))
        if os.path.exists(pjoin(model_root, model_dir, 'checkpoint')):
            # Note: the path to lstmtraining is hard-coded to a local build here.
            cmd_list.append(
                '/Users/doreen/Documents/Experiment/Package/tesseract/src/training/lstmtraining '
                '--stop_training --continue_from %s --traineddata %s --model_output %s' %
                (best_model,
                 pjoin(model_root, model_dir, model_prefix,
                       '%s.traineddata' % model_prefix),
                 pjoin(model_root, model_dir, model_prefix + '.traineddata')))
        convert_image('engines/eval')
        image_files = get_all_files(data_folder=eval_folder, postfix='.tif')
        for imf in image_files:
            cmd_list.append('tesseract -l %s %s/%s.tif %s/%s ' %
                            (model_prefix, eval_folder, imf, eval_folder, imf))
    if engine != 'tesseract':
        cmd_list.append(deact_environ)
    cmd = '\n'.join(cmd_list)
    print(cmd_list)
    subprocess.run(cmd, shell=True)
    gt_files = [eval_folder + '/' + ele for ele in os.listdir(eval_folder)
                if ele.endswith('.gt.txt')]
    if engine == 'calamari':
        res = evaluate(gt_files, flag_confusion=1, extension='.pred.txt')
        res_files = [os.getcwd() + '/' + ele[:-len(".gt.txt")] + '.pred.txt'
                     for ele in gt_files]
    else:
        res = evaluate(gt_files, flag_confusion=1)
        res_files = [os.getcwd() + '/' + ele[:-len(".gt.txt")] + '.txt'
                     for ele in gt_files]
    report_file = add_eval_report(file_test, file_train, file_config, model_file)
    out_file = pjoin(os.getcwd(), eval_root, report_file + '.tar.gz')
    compress_file(res_files, out_file)
    with open(pjoin(eval_root, report_file), 'w') as f_:
        f_.write('\nTotal characters:\t%d\n' % res["char_total"])
        f_.write('Character errors:\t%d\n' % res["char_errs"])
        f_.write('Character error rate:\t%.3f\n' % res["char_error_rate"])
        f_.write('Total words:\t%d\n' % res["word_total"])
        f_.write('Word errors:\t%d\n' % res["word_errs"])
        f_.write('Word error rate:\t%.3f\n\n\n' % res["word_error_rate"])
        f_.write('count\tgenerated\tcorrect\n')
        for v, a, b in res["confusion"]:
            f_.write("%d\t%s\t%s\n" % (v, a, b))
    return report_file
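# Example (sketch, hypothetical file names): evaluating a calamari checkpoint on
# a held-out archive could look like
#   report = eval_from_file('test_100.tar.gz', 'train_500.tar.gz',
#                           'sample_calamari.json', 'model_00000042')
# The report text is written to eval_root/<report_file> and the raw predictions
# are packed into eval_root/<report_file>.tar.gz.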
def validate_continue_from(continue_from_schema, new_config):
    err_str = []
    engine = new_config["engine"]

    # check whether the continue_from parameter is valid against the engine schema
    resolver = RefResolver('file://%s/engines/schemas/' % os.getcwd(), None)
    validator = Draft4Validator(continue_from_schema, resolver=resolver)
    continue_from = new_config["continue_from"]
    for error in validator.iter_errors(continue_from):
        err_str.append('parameter continue_from, %s' % error.message)
    if len(err_str) > 0:
        return err_str

    # check whether the engine matches
    if not os.path.exists(os.path.join(config_root, continue_from["config"])):
        err_str.append(
            'parameter continue_from, configuration for the old model does not exist')
        return err_str
    old_config = read_json(os.path.join(config_root, continue_from["config"]))
    if old_config["engine"] != new_config["engine"]:
        err_str.append(
            'parameter engine, engines for the old model and the new model do not match')
        return err_str

    # check whether the path to continue from exists
    model_dir = get_model_dir(continue_from["trainset"], continue_from["config"])
    if engine == 'tesseract':
        model_path = os.path.join(model_root, model_dir, 'checkpoint',
                                  continue_from["model"])
    else:
        model_path = os.path.join(model_root, model_dir, continue_from["model"])
    if engine == 'calamari':
        model_path += '.json'
    if not os.path.exists(model_path):
        err_str.append('parameter continue_from, model does not exist')

    # check whether the model structure is right
    if "model" in new_config:
        if engine == 'calamari':
            if new_config['model'] != old_config["model"]:
                err_str.append(
                    'parameter model, old model and new model must match for calamari.')
                return err_str
        elif engine == 'ocropus':
            err_str.append(
                'parameter model, ocropus does not support a new model structure for fine-tuning.')
            return err_str
        if "append" in new_config:
            append_index = new_config["append"]
            if append_index < 1:
                err_str.append('parameter append, please assign a valid append')
            new_model = new_config["model"]
            old_model = old_config["model"]
            len_old_model = len(old_model) if "input" in old_model[0] \
                else len(old_model) + 1
            len_old_model = len_old_model - 1 if "output" in old_model[-1] \
                else len_old_model
            if append_index >= len_old_model:
                err_str.append(
                    'parameter append, append_index must be less than the number of layers '
                    '(excluding the output layer, including the input layer).')
            concat_model = old_model[:append_index] if "input" in old_model[0] \
                else old_model[:append_index - 1]
            concat_model += new_model
    else:
        if "append" in new_config:
            err_str.append(
                'parameter append, please specify the model structure to append.')
        if engine == 'calamari':
            if "model" in old_config:
                err_str.append(
                    'parameter model, old model and new model must match for calamari')
    return err_str
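# Illustrative sketch (hypothetical values): a new_config that passes these checks
# for calamari would look roughly like
#   {
#       "engine": "calamari",
#       "model": [...],                      # must equal the old config's "model"
#       "continue_from": {
#           "trainset": "train_500.tar.gz",
#           "config": "old_calamari.json",
#           "model": "model_00000042"        # resolved as model_00000042.json on disk
#       }
#   }
# validate_continue_from() returns a (possibly empty) list of error strings.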
def valid_from_file(file_train, file_config):
    configs = read_json(pjoin('static/configs', file_config))
    model_dir = get_model_dir(file_train, file_config)
    engine = configs["engine"]
    common_schema = read_json("engines/schemas/common.schema")
    model_prefix = configs["model_prefix"] if "model_prefix" in configs \
        else common_schema["definitions"]["model_prefix"]["default"]
    # noinspection PyInterpreter
    dict_res, best_perform, best_model = read_report(model_dir)

    # write the evaluation report
    f_out = open(pjoin(model_root, model_dir, 'report'), 'w')
    model_postfix = get_model_postfixes(engine, model_dir, model_prefix)
    if len(model_postfix) == 0:
        return 'No model yet.'
    for index in model_postfix:
        if int(index) not in dict_res:
            model_file = get_model_path(engine, model_prefix, index)
            model_path = pjoin(model_root, model_dir, model_file)
            if engine == 'tesseract':
                cmd_list = [
                    'export TESSDATA_PREFIX=%s' % pjoin(os.getcwd(), model_root, model_dir),
                    # Note: the path to lstmtraining is hard-coded to a local build here.
                    '/Users/doreen/Documents/Experiment/Package/tesseract/src/training/lstmtraining '
                    '--stop_training --continue_from %s --traineddata %s --model_output %s' %
                    (model_path,
                     pjoin(model_root, model_dir, model_prefix,
                           '%s.traineddata' % model_prefix),
                     pjoin(model_root, model_dir, model_prefix + '.traineddata'))
                ]
                convert_image(valid_folder)
                image_files = get_all_files(data_folder=valid_folder, postfix='.tif')
                for imf in image_files:
                    cmd_list.append(
                        'tesseract -l %s %s/%s.tif %s/%s ' %
                        (model_prefix, valid_folder, imf, valid_folder, imf))
            else:
                cmd_list = [act_environ(engine)]
                cmd_list.append(get_cmd(engine, model_path))
                cmd_list.append('conda deactivate')
            cmd = '\n'.join(cmd_list)
            subprocess.run(cmd, shell=True)
            gt_files = [
                valid_folder + '/' + ele for ele in os.listdir(valid_folder)
                if ele.endswith('.gt.txt')
            ]
            if engine == 'calamari':
                res_str = evaluate(gt_files, flag_confusion=0, extension='.pred.txt')
            else:
                res_str = evaluate(gt_files, flag_confusion=0)
            if float(res_str["char_error_rate"]) > best_perform:
                best_perform = float(res_str["char_error_rate"])
                best_model = index
            f_out.write(
                'Iteration: %s, character errors: %d, total characters: %d, '
                'char error rate: %s, word errors: %d, total words: %d, '
                'word error rate: %s\n' %
                (index, res_str["char_errs"], res_str["char_total"],
                 res_str["char_error_rate"], res_str["word_errs"],
                 res_str["word_total"], res_str["word_error_rate"]))
        else:
            res_str = dict_res[int(index)]
            f_out.write('Iteration: %s, %s\n' % (res_str[1], res_str[0]))
    f_out.close()
    copy_best_model(engine, model_dir, model_prefix, best_model)


# from_file('train_500.tar.gz', 'sample_calamari.json')
# eval_from_file(model_dir='tess_new', engine='tesseract', model_prefix='tess')
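# valid_from_file('train_500.tar.gz', 'sample_tesseract.json')  # sketch, hypothetical
#                                                               # file names: validate all
#                                                               # checkpoints of one run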