Example #1
def read_parameters(config_file):
    configs = read_json(config_file)
    engine = configs["engine"]
    schema = read_json('engines/schemas/%s.schema' % engine)
    new_configs = read_parameter_from_schema(schema)
    for attr in new_configs:
        if attr in configs:
            new_configs[attr] = configs[attr]
    # updated_configs = Config(new_configs)
    return new_configs
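
Note: every example on this page depends on a read_json helper that is not shown here. A minimal sketch of what such a helper presumably looks like (a thin wrapper around json.load; the real okralact helper may differ):

import json

def read_json(path):
    # Assumption: read_json simply loads a JSON/JSON-schema file from disk
    # and returns the parsed dictionary.
    with open(path, 'r', encoding='utf-8') as fp:
        return json.load(fp)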
Example #2
def get_old_traineddata(config):
    old_config_file = config["continue_from"]["config"]
    old_config = read_json(os.path.join(config_root, old_config_file))
    old_train = config["continue_from"]["trainset"]
    old_model_dir = get_model_dir(old_train, old_config_file)
    common_schema = read_json("engines/schemas/common.schema")
    old_model_prefix = old_config["model_prefix"] if "model_prefix" in old_config \
        else common_schema["definitions"]["model_prefix"]["default"]
    old_traineddata = os.path.join(model_root, old_model_dir, old_model_prefix,
                                   old_model_prefix + '.traineddata')
    return old_traineddata
Example #3
File: common.py Project: stweil/okralact
def read_layer_info(layername):
    layer_schema = read_json("engines/schemas/models/layer_%s.schema" %
                             layername)
    default_values = {}
    for key in layer_schema["definitions"]:
        default_values[key] = layer_schema["definitions"][key]["default"]
    return default_values
Example #4
File: common.py Project: stweil/okralact
def load_jsonref(ref_path):
    ref_file, ref_propty, ref_attr = ref_path.rsplit('/', 2)
    ref_file = ref_file[:-1]
    schema_path = '%s/engines/schemas/%s' % (os.getcwd(), ref_file)
    print(schema_path)
    ref_schema = read_json(schema_path)
    return ref_schema[ref_propty][ref_attr]
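
The slicing above assumes a JSON-pointer-style reference of the form '<file>#/<section>/<name>'. A small standalone illustration of the decomposition (the reference string below is hypothetical, not taken from the project):

ref_path = 'common.schema#/definitions/model_prefix'   # hypothetical $ref value
ref_file, ref_propty, ref_attr = ref_path.rsplit('/', 2)
ref_file = ref_file[:-1]   # drop the trailing '#'
print(ref_file, ref_propty, ref_attr)
# -> common.schema definitions model_prefix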
Example #5
def validate_model(model, engine):
    err_str = []
    if type(model) is not list:
        err_str.append('parameter model, model should be a list of arrays')
        return err_str
    layers = read_json('engines/schemas/models/layer_all_%s.schema' %
                       engine)["definitions"]
    layer_no = 0
    for ele in model:
        if type(ele) is not dict:
            err_str.append('parameter model, layer %d must be a dictionary' %
                           layer_no)
            continue
        if len(ele.keys()) != 1:
            err_str.append(
                'parameter model, layer %d must contain only one layer' %
                layer_no)
            continue
        for key in ele:
            if key not in layers:
                err_str.append(
                    'parameter model, layer %s is not defined in the model of engine %s'
                    % (key, engine))
            else:
                resolver = RefResolver(
                    'file://%s/engines/schemas/models/' % os.getcwd(), None)
                validator = Draft4Validator(layers[key], resolver=resolver)
                for error in validator.iter_errors(ele):
                    err_str.append('parameter model, layer %s, %s' %
                                   (key, error.message))
        layer_no += 1
    return err_str
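
The RefResolver/Draft4Validator pattern used above comes from the jsonschema package. A self-contained sketch of the same pattern with a made-up layer schema (not one of okralact's schema files):

from jsonschema import Draft4Validator, RefResolver

# Invented stand-in for one entry of layer_all_<engine>.schema["definitions"]
layer_schema = {
    "type": "object",
    "properties": {
        "lstm": {
            "type": "object",
            "properties": {"hidden_size": {"type": "integer", "minimum": 1}},
            "required": ["hidden_size"],
        }
    },
}

# The resolver only matters when the schema contains "$ref" entries;
# it is kept here to mirror the call in validate_model.
resolver = RefResolver('file:///tmp/', None)
validator = Draft4Validator(layer_schema, resolver=resolver)

for error in validator.iter_errors({"lstm": {"hidden_size": 0}}):
    print(error.message)   # -> 0 is less than the minimum of 1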
Example #6
def delete():
    trainset = request.args.get('trainname', None)
    config = request.args.get('config', None)
    model_dir = get_model_dir(trainset, config)
    file_name = request.args.get("filename", None)
    config_content = read_json(os.path.join(config_root, config))
    if config_content["engine"] == 'tesseract':
        if file_name.endswith('.traineddata'):
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name)
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    "checkpoint", file_name)
        os.remove(out_file)
    elif config_content["engine"] == 'calamari':
        files = os.listdir(os.path.join(os.getcwd(), model_root, model_dir))
        files = [
            os.path.join(model_root, model_dir, ele) for ele in files
            if ele.startswith(file_name)
        ]
        for out_file in files:
            os.remove(out_file)
    else:
        out_file = os.path.join(os.getcwd(), model_root, model_dir, file_name)
        os.remove(out_file)
    return redirect(
        url_for('manage_model_list', trainset=trainset, config=config))
Example #7
def download():
    trainset = request.args.get('trainname', None)
    config = request.args.get('config', None)
    model_dir = get_model_dir(trainset, config)
    file_name = request.args.get("filename", None)
    config_content = read_json(os.path.join(config_root, config))
    print(config_content["engine"])
    if config_content["engine"] == 'tesseract':
        if '.traineddata' in file_name:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name)
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    "checkpoint", file_name)
    elif config_content["engine"] == "calamari":
        if file_name != 'report':
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name + '.tar.gz')
            if os.path.exists(out_file):
                os.remove(out_file)
            files = os.listdir(os.path.join(model_root, model_dir))
            files = [
                os.path.join(model_root, model_dir, ele) for ele in files
                if ele.startswith(file_name)
            ]
            compress_file(files, out_file)
            file_name += '.tar.gz'
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name)
    else:
        out_file = os.path.join(os.getcwd(), model_root, model_dir, file_name)
    return send_file(out_file,
                     attachment_filename=file_name,
                     as_attachment=True)
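
A note on the Flask call above: attachment_filename is the pre-2.0 keyword of send_file. On Flask 2.0 or newer the equivalent call would be (a sketch, assuming the same variables):

    return send_file(out_file,
                     download_name=file_name,
                     as_attachment=True)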
Example #8
File: common.py Project: stweil/okralact
def read_model_param(engine):
    engine_schema = read_json('engines/schemas/models/model_%s.schema' %
                              engine)
    model = engine_schema["definitions"]["model"]
    layers = model["items"]["oneOf"]
    help_info = []
    for layer in layers:
        layer_def = load_jsonref('models/' + layer["$ref"])["properties"]
        for key in layer_def:
            cur_layer = layer_def[key]
            help_info.append(
                [key, '', "dictionary", '', cur_layer["description"]])
            for para in layer_def[key]["properties"]:
                para_def = load_jsonref("models/" +
                                        cur_layer["properties"][para]["$ref"])
                # print(para_def)
                if "enum" in para_def:
                    help_info.append([
                        '', para, para_def["type"], para_def["default"],
                        'Allowed values: ' +
                        ', '.join(map(str, para_def["enum"])) + '. ' +
                        para_def["description"]
                    ])
                else:
                    help_info.append([
                        '', para, para_def["type"], para_def["default"],
                        para_def["description"]
                    ])
    return help_info
Example #9
def upload():
    trainset = request.args.get('trainname', None)
    config = request.args.get('config', None)
    model_dir = get_model_dir(trainset, config)
    file_name = request.args.get("filename", None)
    config_content = read_json(os.path.join(config_root, config))
    print(config_content["engine"])
    if config_content["engine"] == 'tesseract':
        if '.traineddata' in file_name:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name)
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    "checkpoint", file_name)
    elif config_content["engine"] == "calamari":
        if file_name != 'report':
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name + '.tar.gz')
            if os.path.exists(out_file):
                os.remove(out_file)
            files = os.listdir(os.path.join(model_root, model_dir))
            files = [
                os.path.join(model_root, model_dir, ele) for ele in files
                if ele.startswith(file_name)
            ]
            compress_file(files, out_file)
            file_name += '.tar.gz'
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name)
    else:
        out_file = os.path.join(os.getcwd(), model_root, model_dir, file_name)
    print('uploading...')
    publish_model(
        access_token=app.token,
        model_file=out_file,  # local path
        remote_file='my_image.jpg',  # remote name (no path)
        ocr_engine=config_content[
            "engine"],  # OCR engine which can run the model
        license_name=
        'WTFPL',  # it seems that Zenodo recognizes acronyms, such as this one
        metadata={  # insert whatever you want in this map
            'info':
            'this map can contain anything; if you do not want it, set it to none',
            'content':
            'ideally it should contain all information about the training data, the parameters, the result accuracy, ...',
            'usage': 'this gets uploaded as metadata.json along with the model'
        },
        related_DOI=[
            ('cites', '123')
        ],  # if other DOIs should be referred to, add them here as pairs (link, doi); otherwise set this to None
        is_draft=
        True  # if true, then the publish request will not be sent and the upload will stay as a draft
    )
    print('uploaded!')
    return redirect(
        url_for('manage_model_list', trainset=trainset, config=config))
Example #10
def read_parameter_default(engine):
    common_schema = read_json("engines/schemas/common.schema")
    default_values = {}
    for key in common_schema["definitions"]:
        if common_schema["definitions"][key]["type"] != "object":
            default_values[key] = common_schema["definitions"][key]["default"]
    engine_schema = read_json("engines/schemas/engine_%s.schema" % engine)
    for key in engine_schema["properties"]:
        if "$ref" not in engine_schema["properties"][key]:
            if engine_schema["properties"][key]["type"] == "object":
                for ele in engine_schema["properties"][key]["properties"]:
                    new_key = '%s_%s' % (key, ele)
                    default_values[new_key] = engine_schema["properties"][key][
                        "properties"][ele]["default"]
            else:
                default_values[key] = engine_schema["properties"][key][
                    "default"]
    return default_values
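
For reference, the loop above flattens nested "object" parameters into '<group>_<name>' keys. A standalone illustration with an invented schema fragment (not an actual okralact schema):

engine_schema = {
    "properties": {
        "nepoch": {"type": "integer", "default": 100},
        "early_stop": {
            "type": "object",
            "properties": {
                "frequency": {"default": 5},
                "threshold": {"default": 0.001},
            },
        },
    }
}

default_values = {}
for key in engine_schema["properties"]:
    node = engine_schema["properties"][key]
    if node["type"] == "object":
        for ele in node["properties"]:
            default_values['%s_%s' % (key, ele)] = node["properties"][ele]["default"]
    else:
        default_values[key] = node["default"]

print(default_values)
# -> {'nepoch': 100, 'early_stop_frequency': 5, 'early_stop_threshold': 0.001}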
Example #11
def read_help_information(engine):
    schema = read_json('schemas/%s.schema' % engine)
    attrs = schema['properties']
    help_info = 'OCR Engine: %s\n' % engine
    help_info += 'Parameters: \n'
    for k in attrs:
        help_info += '\t --%s\tdefault:[%s]\t%s\n' % (
            k, str(attrs[k]["default"]), attrs[k]["description"])
    return help_info
Example #12
    def __init__(self, file_config, model_dir):
        self.configs = read_json(pjoin(config_root, file_config))
        self.engine = self.configs["engine"]
        self.model_dir = model_dir

        self.translator = read_json('engines/schemas/translate.json')[
            self.engine]

        # load default values
        # self.default = read_parameter_default(self.engine)
        # replace default values with user specified values
        self.values = read_value(self.configs, self.engine)
        if "continue_from" in self.configs:
            print('translate path')
            self.values["continue_from"] = translate_continue_path(
                self.configs["engine"], self.configs["continue_from"])
        else:
            self.values["continue_from"] = ''

        if "model" in self.configs:
            self.model_translator = ModelTranslator(self.configs["model"],
                                                    self.engine)
        elif "continue_from" not in self.configs:
            self.configs["model"] = read_json(
                'engines/schemas/models/default_model_%s.schema' % self.engine)
            self.model_translator = ModelTranslator(self.configs["model"],
                                                    self.engine)
        self.model_prefix = self.values["model_prefix"]
        self.nepoch = self.values['nepoch']
        partition = self.values['partition']
        self.ntrain, self.ntest = split_train_test(data_folder,
                                                   tmp_folder,
                                                   partition,
                                                   engine=self.engine)

        self.translate()
        self.cmd_list = [act_environ(self.engine)] + self.cmd_list + [deact_environ]\
            if self.engine != 'tesseract' else self.cmd_list
Example #13
File: common.py Project: stweil/okralact
def read_model_info(engine):
    engine_schema = read_json('engines/schemas/models/model_%s.schema' %
                              engine)
    model = engine_schema["definitions"]["model"]
    layers = model["items"]["oneOf"]
    help_info = []
    help_info.append(["model", '', model["type"], '', model["description"]])
    for layer in layers:
        layer_def = load_jsonref('models/' + layer["$ref"])["properties"]
        for key in layer_def:
            cur_layer = layer_def[key]
            help_info.append(
                ['', key, "dictionary", '', cur_layer["description"]])
    return help_info
Example #14
def validate_string(config_str):
    config, err = read_config_str(config_str)  # read configuration file
    if len(err) > 0:
        errors = ['Configuration file is not a valid dictionary.']
        for e in err:
            errors.append('\t' + e)
        return errors
    common_schema = read_json(
        'engines/schemas/common.schema')  # valid against common schema
    errors = validate(common_schema, config)
    if len(errors) > 0:
        return errors
    errors = []
    engine = config["engine"]
    engine_schema = read_json('engines/schemas/engine_%s.schema' % engine)
    errors_model = []
    if "model" in config:  # valid against model schema
        model = config["model"]
        errors_model = validate_model(model, engine)
    errors += errors_model
    if "continue_from" in config:
        errors += validate_continue_from(
            engine_schema["properties"]["continue_from"], config)
        del config["continue_from"]
    elif "append" in config:
        errors += [
            'parameter append, please specify the model structure to append.'
        ]
    #     errors +=  errors_continue
    if "model" in config:
        del config["model"]
    errors += validate(engine_schema, config)  # valid against engine schema
    nerr = len(errors)
    for i in range(nerr):
        errors[i] = 'Error %d: %s' % (i, errors[i])
    return errors
Example #15
File: common.py Project: stweil/okralact
def read_help_information_html(engine):
    schema = read_json('engines/schemas/engine_%s.schema' % engine)
    attrs = schema['properties']
    help_info = []
    for k in attrs:
        print(k, attrs[k])
        if k == "model":
            help_info += read_model_info(engine)
            continue
        elif "$ref" in attrs[k]:
            ref_path = attrs[k]["$ref"]
            cur_node = load_jsonref(ref_path)
            if cur_node["type"] == "object":
                help_info += read_object(k, cur_node)
                continue
        elif attrs[k]["type"] == "object":
            cur_node = attrs[k]
            help_info += read_object(k, cur_node)
            continue
        else:
            cur_node = attrs[k]
        if cur_node["type"] == "number":
            help_info.append([
                k, '', cur_node["format"],
                str(cur_node["default"]), cur_node["description"]
            ])
        else:
            if "enum" in cur_node:
                help_info.append([
                    k, '', cur_node["type"],
                    str(cur_node["default"]),
                    "Allowed Value: " + ', '.join(cur_node["enum"]) + '. ' +
                    cur_node["description"]
                ])
            else:
                help_info.append([
                    k, '', cur_node["type"],
                    str(cur_node["default"]), cur_node["description"]
                ])
    return help_info
Example #16
def valid_from_file(file_train, file_config):
    configs = read_json(pjoin('static/configs', file_config))
    model_dir = get_model_dir(file_train, file_config)
    engine = configs["engine"]
    common_schema = read_json("engines/schemas/common.schema")
    model_prefix = configs["model_prefix"] if "model_prefix" in configs \
        else common_schema["definitions"]["model_prefix"]["default"]
    # noinspection PyInterpreter
    dict_res, best_perform, best_model = read_report(model_dir)
    # write the evaluation report
    f_out = open(pjoin(model_root, model_dir, 'report'), 'w')
    model_postfix = get_model_postfixes(engine, model_dir, model_prefix)
    if len(model_postfix) == 0:
        return 'No model yet.'
    for index in model_postfix:
        if int(index) not in dict_res:
            model_file = get_model_path(engine, model_prefix, index)
            model_path = pjoin(model_root, model_dir, model_file)
            if engine == 'tesseract':
                cmd_list = [
                    'export TESSDATA_PREFIX=%s' %
                    pjoin(os.getcwd(), model_root, model_dir),
                    '/Users/doreen/Documents/Experiment/Package/tesseract/src/training/lstmtraining --stop_training --continue_from %s --traineddata %s --model_output %s'
                    % (model_path,
                       pjoin(model_root, model_dir, model_prefix,
                             '%s.traineddata' % model_prefix),
                       pjoin(model_root, model_dir,
                             model_prefix + '.traineddata'))
                ]
                convert_image(valid_folder)
                image_files = get_all_files(data_folder=valid_folder,
                                            postfix='.tif')
                for imf in image_files:
                    cmd_list.append(
                        'tesseract -l %s %s/%s.tif %s/%s ' %
                        (model_prefix, valid_folder, imf, valid_folder, imf))
            else:
                cmd_list = [act_environ(engine)]
                cmd_list.append(get_cmd(engine, model_path))
                cmd_list.append('conda deactivate')
            cmd = '\n'.join(cmd_list)
            subprocess.run(cmd, shell=True)
            gt_files = [
                valid_folder + '/' + ele for ele in os.listdir(valid_folder)
                if ele.endswith('.gt.txt')
            ]
            if engine == 'calamari':
                res_str = evaluate(gt_files,
                                   flag_confusion=0,
                                   extension='.pred.txt')
            else:
                res_str = evaluate(gt_files, flag_confusion=0)
            if float(res_str["char_error_rate"]) > best_perform:
                best_perform = float(res_str["char_error_rate"])
                best_model = index
            f_out.write(
                'Iteration: %s,  character  errors: %d, total characters: %d, char error rate: %s, word errors: %d, total words: %d, word error rate: %s\n'
                % (index, res_str["char_errs"], res_str["char_total"],
                   res_str["char_error_rate"], res_str["word_errs"],
                   res_str["word_total"], res_str["word_error_rate"]))
        else:
            res_str = dict_res[int(index)]
            f_out.write('Iteration: %s, %s\n' % (res_str[1], res_str[0]))
    copy_best_model(engine, model_dir, model_prefix, best_model)


# from_file('train_500.tar.gz', 'sample_calamari.json')
# eval_from_file(model_dir='tess_new', engine='tesseract', model_prefix='tess')
Example #17
def process_kraken_reshape_size(config):
    old_model = read_json(
        os.path.join(config_root, config["continue_from"]["config"]))["model"]
    input_size = get_old_input_size(old_model, config["append"])
    return input_size
Example #18
def validate_continue_from(continue_from_schema, new_config):
    err_str = []
    engine = new_config["engine"]
    # check whether continue from parameter is valid against the engine schema
    resolver = RefResolver('file://%s/engines/schemas/' % os.getcwd(), None)
    validator = Draft4Validator(continue_from_schema, resolver=resolver)
    continue_from = new_config["continue_from"]
    for error in validator.iter_errors(continue_from):
        err_str.append('parameter continue_from, %s' % error.message)
    if len(err_str) > 0:
        return err_str
    # check whether engine matches
    if not os.path.exists(os.path.join(config_root, continue_from["config"])):
        err_str.append(
            'parameter continue_from, configuration for the old model does not exist'
        )
        return err_str
    old_config = read_json(os.path.join(config_root, continue_from["config"]))
    if old_config["engine"] != new_config["engine"]:
        err_str.append(
            'parameter engine, engines for the old model and new model do not match')
        return err_str
    # check whether the path to continue from exists
    model_dir = get_model_dir(continue_from["trainset"],
                              continue_from["config"])
    if engine == 'tesseract':
        model_path = os.path.join(model_root, model_dir, 'checkpoint',
                                  continue_from["model"])
    else:
        model_path = os.path.join(model_root, model_dir,
                                  continue_from["model"])

    if engine == 'calamari':
        model_path += '.json'
    if not os.path.exists(model_path):
        err_str.append('parameter continue_from, model does not exist')
    # check whether the model structure is right
    if "model" in new_config:
        if engine == 'calamari':
            if new_config['model'] != old_config["model"]:
                err_str.append(
                    'parameter model, old model and new model must match for calamari.'
                )
                return err_str
        elif engine == 'ocropus':
            err_str.append(
                'parameters model, ocropus does not support new model structure for fine tuning.'
            )
            return err_str
        if "append" in new_config:
            append_index = new_config["append"]
            if "append" not in new_config or append_index < 1:
                err_str.append(
                    'parameter append, please assign a valid append')
            new_model = new_config["model"]
            old_model = old_config["model"]
            len_old_model = len(
                old_model) if "input" in old_model[0] else len(old_model) + 1
            len_old_model = len_old_model - 1 if "output" in old_model[
                -1] else len_old_model
            if append_index >= len_old_model:
                err_str.append(
                    'parameter append, append_index must be less than the number of layers (excluding output layer, including input layer).'
                )
            concat_model = old_model[:append_index] if "input" in old_model[
                0] else old_model[:append_index - 1]
            concat_model += new_model
    else:
        if "append" in new_config:
            err_str.append(
                'parameter append, please specify the model structure to append.'
            )
        if engine == 'calamari':
            if "model" in old_config:
                err_str.append(
                    'parameter model, old model and new model must match for calamari'
                )
    return err_str
Example #19
    def __init__(self, model, engine):
        self.model = model
        self.engine = engine
        self.translator = read_json(
            "engines/schemas/models/translate_model.json")