Example #1
    def train_books(self, books, output_model_prefix, weights=None, train_to_val=1,
                    max_iters=100000, display=500, checkpoint_frequency=-1, preload=False):
        if isinstance(books, str):
            books = [books]
        dset = Nash5DataSet(DataSetMode.TRAIN, self.cachefile, books)
        # split off a validation set when a ratio strictly between 0 and 1 is given
        if 0 < train_to_val < 1:
            valsamples = random.sample(dset._samples,
                                       int((1 - train_to_val) * len(dset)))
            for s in valsamples:
                dset._samples.remove(s)
            vdset = Nash5DataSet(DataSetMode.TRAIN, self.cachefile, [])
            vdset._samples = valsamples
        else:
            vdset = None

        parser = argparse.ArgumentParser()
        setup_train_args(parser, omit=["files", "validation"])
        args = parser.parse_known_args()[0]
        # enable right-to-left decoding when every selected book is flagged as RTL
        with h5py.File(self.cachefile, 'r', libver='latest', swmr=True) as cache:
            if all(cache[b].attrs.get("dir") == "rtl" for b in books):
                args.bidi_dir = "rtl"
        params = params_from_args(args)
        params.output_model_prefix = output_model_prefix
        params.early_stopping_best_model_prefix = "best_" + output_model_prefix
        params.max_iters = max_iters
        params.display = display
        params.checkpoint_frequency = checkpoint_frequency

        trainer = Trainer(params, dset, txt_preproc=NoopTextProcessor(), data_preproc=NoopDataPreprocessor(),
                          validation_dataset=vdset, weights=weights, preload_training=preload,
                          preload_validation=True)

        trainer.train(progress_bar=True)
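
A minimal usage sketch (not from the source), assuming the method lives on a corpus wrapper that owns the HDF5 cache file; the class name NashCorpus and the book id are placeholders:

# Hypothetical driver: NashCorpus and "book_0001" are illustrative names for
# whatever object provides self.cachefile in the snippet above.
corpus = NashCorpus(cachefile="corpus.h5")
corpus.train_books(
    books="book_0001",               # a single book id; a list of ids works too
    output_model_prefix="model_book_0001",
    train_to_val=0.8,                # 80% training, 20% validation split
    max_iters=50000,
)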
Example #2
def main(args=None):
    if args is None:
        # parse args from command line
        parser = argparse.ArgumentParser()

        # fold parameters
        parser.add_argument("--files", nargs="+",
                            help="List all image files that shall be processed. Ground truth fils with the same "
                                 "base name but with '.gt.txt' as extension are required at the same location")
        parser.add_argument("--n_folds", type=int, default=5,
                            help="The number of fold, that is the number of models to train")
        parser.add_argument("--keep_temporary_files", action="store_true",
                            help="By default all temporary files (e.g. intermediate checkpoints) will be erased. Set this "
                                 "flag if you want to keep those files.")
        parser.add_argument("--best_models_dir", type=str, required=True,
                            help="path where to store the best models of each fold")
        parser.add_argument("--best_model_label", type=str, default="{id}",
                            help="The label of the best model in best model dirs. This will be string formatted. "
                                 "The default '{id}' will label the models 0, 1, 2, 3, ...")
        parser.add_argument("--temporary_dir", type=str, default=None,
                            help="A path to a temporary dir, where the intermediate model training data will be stored"
                                 "for each fold. Use --keep_temporary_files flag to keep the files. By default a system"
                                 "temporary dir will be used")
        parser.add_argument("--run", type=str, default=None,
                            help="An optional command that will receive the train calls. Useful e.g. when using a resource "
                                 "manager such as slurm.")
        parser.add_argument("--max_parallel_models", type=int, default=-1,
                            help="Number of models to train in parallel. Defaults to all.")
        parser.add_argument("--weights", type=str, nargs="+", default=[],
                            help="Load network weights from the given file. If more than one file is provided the number "
                                 "models must match the number of folds. Each fold is then initialized with the weights "
                                 "of each model, respectively. If a model path is set to 'None', this model will start "
                                 "from scratch")
        parser.add_argument("--single_fold", type=int, nargs="+", default=[],
                            help="Only train a single (list of single) specific fold(s).")

        # add the training args (omit those params, that are set by the cross fold training)
        setup_train_args(parser, omit=["files", "validation", "weights",
                                       "early_stopping_best_model_output_dir", "early_stopping_best_model_prefix"])

        args = parser.parse_args()

    # argument checks
    if len(args.weights) > 1 and len(args.weights) != args.n_folds:
        raise Exception("Either no, one or n_folds (={}) models are required for pretraining but got {}.".format(
            args.n_folds, len(args.weights)
        ))

    if len(args.single_fold) > 0:
        if len(set(args.single_fold)) != len(args.single_fold):
            raise Exception("Repeated fold id's found.")
        for fold_id in args.single_fold:
            if fold_id < 0 or fold_id >= args.n_folds:
                raise Exception("Invalid fold id found: 0 <= id <= {}, but id == {}".format(args.n_folds, fold_id))

    # automatically set the number of models that shall be run in parallel
    if args.max_parallel_models <= 0:
        args.max_parallel_models = args.n_folds

    # by default, the temporary files are deleted after a successful training;
    # if you specify a temporary dir, you can easily resume training if an error occurred
    if args.keep_temporary_files and not args.temporary_dir:
        raise Exception("If you want to keep the temporary model files you have to specify a temporary dir")

    if not args.temporary_dir:
        args.temporary_dir = tempfile.mkdtemp(prefix="calamari")
    else:
        args.temporary_dir = os.path.abspath(args.temporary_dir)
        if not os.path.exists(args.temporary_dir):
            os.makedirs(args.temporary_dir)

    # location of best models output
    if not os.path.exists(args.best_models_dir):
        os.makedirs(args.best_models_dir)

    # locate the training script (must be in the same dir as "this")
    train_script_path = os.path.join(this_absdir, "train.py")

    if not os.path.exists(train_script_path):
        raise Exception("Missing train script path. Expected 'train.py' at {}".format(this_absdir))

    # Compute the files in the cross fold (create a CrossFold)
    fold_file = os.path.join(args.temporary_dir, "folds.json")
    cross_fold = CrossFold(n_folds=args.n_folds, source_files=args.files, output_dir=args.best_models_dir)
    cross_fold.write_folds_to_json(fold_file)

    # Create the json argument file for each individual training
    run_args = []
    folds_to_run = args.single_fold if len(args.single_fold) > 0 else range(len(cross_fold.folds))
    for fold in folds_to_run:
        train_files = cross_fold.train_files(fold)
        test_files = cross_fold.test_files(fold)
        path = os.path.join(args.temporary_dir, "fold_{}.json".format(fold))
        with open(path, 'w') as f:
            fold_args = vars(args).copy()
            fold_args["id"] = fold
            fold_args["files"] = train_files
            fold_args["validation"] = test_files
            fold_args["train_script"] = train_script_path
            fold_args["verbose"] = True
            fold_args["output_dir"] = os.path.join(args.temporary_dir, "fold_{}".format(fold))
            fold_args["early_stopping_best_model_output_dir"] = args.best_models_dir
            fold_args["early_stopping_best_model_prefix"] = args.best_model_label.format(id=fold)

            if args.seed >= 0:
                fold_args["seed"] = args.seed + fold

            if len(args.weights) == 1:
                fold_args["weights"] = args.weights[0]
            elif len(args.weights) > 1:
                fold_args["weights"] = args.weights[fold]
            else:
                fold_args["weights"] = None

            # start from scratch via None
            if fold_args["weights"]:
                if len(fold_args["weights"].strip()) == 0 or fold_args["weights"].upper() == "NONE":
                    fold_args["weights"] = None

            json.dump(
                fold_args,
                f,
                indent=4,
            )

        run_args.append({"json": path, "args": fold_args})

    # Launch the individual processes for each training
    with multiprocessing.Pool(processes=args.max_parallel_models) as pool:
        # workaround to forward keyboard interrupt
        pool.map_async(train_individual_model, run_args).get(999999999)
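
Because main() only parses the command line when args is None, it can also be driven programmatically. A minimal sketch under that assumption; all attribute values are illustrative, and a real run would additionally carry the attributes that setup_train_args registers (only seed is read directly here):

import argparse

fold_args = argparse.Namespace(
    files=["lines/00*.png"],         # hypothetical images with .gt.txt ground truth
    n_folds=5,
    keep_temporary_files=False,
    best_models_dir="models/best",
    best_model_label="{id}",
    temporary_dir=None,              # falls back to tempfile.mkdtemp(prefix="calamari")
    run=None,
    max_parallel_models=-1,          # resolved to n_folds by main()
    weights=[],
    single_fold=[],
    seed=42,                         # each fold is seeded with seed + fold id
)
main(fold_args)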
Example #3
def main(args=None):
    if args is None:
        # parse args from command line
        parser = argparse.ArgumentParser()

        # fold parameters
        parser.add_argument("--files", nargs="+",
                            help="List all image files that shall be processed. Ground truth fils with the same "
                                 "base name but with '.gt.txt' as extension are required at the same location. "
                                 "Optionally you can pass a single json file defining all arguments")
        parser.add_argument("--dataset", type=DataSetType.from_string, choices=list(DataSetType), default=DataSetType.FILE)
        parser.add_argument("--text_files", nargs="+", default=None,
                            help="Optional list of GT files if they are in other directory")
        parser.add_argument("--gt_extension", default=None,
                            help="Default extension of the gt files (expected to exist in same dir)")

        parser.add_argument("--n_folds", type=int, default=5,
                            help="The number of fold, that is the number of models to train")
        parser.add_argument("--keep_temporary_files", action="store_true",
                            help="By default all temporary files (e.g. intermediate checkpoints) will be erased. Set this "
                                 "flag if you want to keep those files.")
        parser.add_argument("--best_models_dir", type=str, required=True,
                            help="path where to store the best models of each fold")
        parser.add_argument("--best_model_label", type=str, default="{id}",
                            help="The label of the best model in best model dirs. This will be string formatted. "
                                 "The default '{id}' will label the models 0, 1, 2, 3, ...")
        parser.add_argument("--temporary_dir", type=str, default=None,
                            help="A path to a temporary dir, where the intermediate model training data will be stored"
                                 "for each fold. Use --keep_temporary_files flag to keep the files. By default a system"
                                 "temporary dir will be used")
        parser.add_argument("--run", type=str, default=None,
                            help="An optional command that will receive the train calls. Useful e.g. when using a resource "
                                 "manager such as slurm.")
        parser.add_argument("--max_parallel_models", type=int, default=-1,
                            help="Number of models to train in parallel. Defaults to all.")
        parser.add_argument("--weights", type=str, nargs="+", default=[],
                            help="Load network weights from the given file. If more than one file is provided the number "
                                 "models must match the number of folds. Each fold is then initialized with the weights "
                                 "of each model, respectively. If a model path is set to 'None', this model will start "
                                 "from scratch")
        parser.add_argument("--single_fold", type=int, nargs="+", default=[],
                            help="Only train a single (list of single) specific fold(s).")

        # add the training args (omit those params, that are set by the cross fold training)
        setup_train_args(parser, omit=["files", "validation", "weights",
                                       "early_stopping_best_model_output_dir", "early_stopping_best_model_prefix",
                                       "output_dir"])

        args = parser.parse_args()

    # check if loading a json file
    if len(args.files) == 1 and args.files[0].endswith("json"):
        with open(args.files[0], 'r') as f:
            json_args = json.load(f)
            for key, value in json_args.items():
                if key == 'dataset' or key == 'validation_dataset':
                    setattr(args, key, DataSetType.from_string(value))
                else:
                    setattr(args, key, value)

    dataset_args = FileDataReaderArgs(
        line_generator_params=args.line_generator_params,
        text_generator_params=args.text_generator_params,
        pad=args.dataset_pad,
        text_index=args.pagexml_text_index,
    )
    train_params = PipelineParams(
        type=args.dataset,
        skip_invalid=not args.no_skip_invalid_gt,
        remove_invalid=True,
        files=args.files,
        text_files=args.text_files,
        gt_extension=args.gt_extension if args.gt_extension else DataSetType.gt_extension(args.dataset),
        data_reader_args=dataset_args,
        batch_size=args.batch_size,
        num_processes=args.num_threads,
    )
    reader = data_reader_from_params(PipelineMode.Training, train_params)

    trainer = CrossFoldTrainer(
        n_folds=args.n_folds,
        data_reader=reader,
        best_models_dir=args.best_models_dir,
        best_model_label=args.best_model_label,
        train_args=vars(args),
        progress_bars=not args.no_progress_bars,
    )
    trainer.run(
        args.single_fold, seed=args.seed, weights=args.weights, max_parallel_models=args.max_parallel_models,
        temporary_dir=args.temporary_dir, keep_temporary_files=args.keep_temporary_files,
    )
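
The json branch at the top of main() lets a single file stand in for the whole command line. A hedged sketch of writing such a file (keys and values are illustrative); note that 'dataset' is stored as a string and converted back through DataSetType.from_string:

import json

cross_fold_args = {
    "files": ["lines/00*.png"],      # hypothetical image files
    "dataset": "FILE",               # round-trips via DataSetType.from_string
    "n_folds": 5,
    "best_models_dir": "models/best",
}
with open("cross_fold_args.json", "w") as f:
    json.dump(cross_fold_args, f, indent=4)
# then invoke the script with: --files cross_fold_args.json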
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_dir", type=str, required=True,
                        help="The base directory where to store all working files")
    parser.add_argument("--eval_files", type=str, nargs="+", required=True,
                        help="All files that shall be used for evaluation")
    parser.add_argument("--train_files", type=str, nargs="+", required=True,
                        help="All files that shall be used for (cross-fold) training")
    parser.add_argument("--n_lines", type=int, default=[-1], nargs="+",
                        help="Optional argument to specify the number of lines (images) used for training. "
                             "On default, all available lines will be used.")
    parser.add_argument("--run", type=str, default=None,
                        help="An optional command that will receive the train calls. Useful e.g. when using a resource "
                             "manager such as slurm.")

    parser.add_argument("--n_folds", type=int, default=5,
                        help="The number of fold, that is the number of models to train")
    parser.add_argument("--max_parallel_models", type=int, default=-1,
                        help="Number of models to train in parallel per fold. Defaults to all.")
    parser.add_argument("--weights", type=str, nargs="+", default=[],
                        help="Load network weights from the given file. If more than one file is provided the number "
                             "models must match the number of folds. Each fold is then initialized with the weights "
                             "of each model, respectively")
    parser.add_argument("--single_fold", type=int, nargs="+", default=[],
                        help="Only train a single (list of single) specific fold(s).")
    parser.add_argument("--skip_train", action="store_true",
                        help="Skip the cross fold training")
    parser.add_argument("--skip_eval", action="store_true",
                        help="Skip the cross fold evaluation")
    parser.add_argument("--verbose", action="store_true",
                        help="Verbose output")
    parser.add_argument("--n_confusions", type=int, default=0,
                        help="Only print n most common confusions. Defaults to 0, use -1 for all.")
    parser.add_argument("--xlsx_output", type=str,
                        help="Optionally write a xlsx file with the evaluation results")

    setup_train_args(parser, omit=["files", "validation", "weights",
                                   "early_stopping_best_model_output_dir", "early_stopping_best_model_prefix",
                                   "output_dir"])

    args = parser.parse_args()

    args.base_dir = os.path.abspath(os.path.expanduser(args.base_dir))

    np.random.seed(args.seed)
    random.seed(args.seed)

    # argument checks
    args.weights = glob_all(args.weights)
    if len(args.weights) > 1 and len(args.weights) != args.n_folds:
        raise Exception("Either no, one or n_folds (={}) models are required for pretraining but got {}.".format(
            args.n_folds, len(args.weights)
        ))

    if len(args.single_fold) > 0:
        if len(set(args.single_fold)) != len(args.single_fold):
            raise Exception("Repeated fold id's found.")
        for fold_id in args.single_fold:
            if fold_id < 0 or fold_id >= args.n_folds:
                raise Exception("Invalid fold id found: 0 <= id <= {}, but id == {}".format(args.n_folds, fold_id))

        actual_folds = args.single_fold
    else:
        actual_folds = list(range(args.n_folds))

    # run for all lines
    single_args = [copy.copy(args) for _ in args.n_lines]
    for s_args, n_lines in zip(single_args, args.n_lines):
        s_args.n_lines = n_lines

    predictions = parallel_map(run_for_single_line, single_args, progress_bar=False,
                               processes=len(single_args), use_thread_pool=True)

    # output predictions as csv:
    header = "lines," + ",".join([str(fold) for fold in range(args.n_folds)])\
             + ",avg,std,seq. vot., def. conf. vot., fuz. conf. vot."

    print(header)

    for prediction_map, n_lines in zip(predictions, args.n_lines):
        prediction = prediction_map["full"]
        data = "{}".format(n_lines)
        folds_lers = []
        for fold in range(len(actual_folds)):
            fold_eval = prediction[str(fold)]["eval"]
            data += ",{}".format(fold_eval['avg_ler'])
            folds_lers.append(fold_eval['avg_ler'])

        data += ",{},{}".format(np.mean(folds_lers), np.std(folds_lers))
        for voter in ['sequence_voter', 'confidence_voter_default_ctc']:
            voter_eval = prediction[voter]["eval"]
            data += ",{}".format(voter_eval['avg_ler'])

        print(data)

    if args.n_confusions != 0:
        for prediction_map, n_lines in zip(predictions, args.n_lines):
            prediction = prediction_map["full"]
            print("")
            print("CONFUSIONS (lines = {})".format(n_lines))
            print("==========")
            print()

            for fold in range(len(actual_folds)):
                print("FOLD {}".format(fold))
                print_confusions(prediction[str(fold)]['eval'], args.n_confusions)

            for voter in ['sequence_voter', 'confidence_voter_default_ctc']:
                print("VOTER {}".format(voter))
                print_confusions(prediction[voter]['eval'], args.n_confusions)

    if args.xlsx_output:
        data_list = []
        for prediction_map, n_lines in zip(predictions, args.n_lines):
            prediction = prediction_map["full"]
            for fold in actual_folds:
                pred = prediction[str(fold)]
                data_list.append({
                    "prefix": "L{} - Fold{}".format(n_lines, fold),
                    "results": pred['eval'],
                    "gt_files": prediction_map['gt_txts'],
                    "gts": prediction_map['gt'],
                    "preds": pred['data']
                })

            for voter in ['sequence_voter', 'confidence_voter_default_ctc']:
                pred = prediction[voter]
                data_list.append({
                    "prefix": "L{} - {}".format(n_lines, voter[:3]),
                    "results": pred['eval'],
                    "gt_files": prediction_map['gt_txts'],
                    "gts": prediction_map['gt'],
                    "preds": pred['data']
                })

        write_xlsx(args.xlsx_output, data_list)
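
The reporting loops above pin down the shape of each entry in predictions; a sketch of that structure as inferred from the lookups (key names come from the code, the numbers and texts are made up):

# Inferred result layout for one n_lines setting; illustrative values only.
prediction_map = {
    "full": {
        "0": {"eval": {"avg_ler": 0.021}, "data": ["predicted text", ...]},  # one entry per fold id
        "sequence_voter": {"eval": {"avg_ler": 0.017}, "data": [...]},
        "confidence_voter_default_ctc": {"eval": {"avg_ler": 0.016}, "data": [...]},
    },
    "gt_txts": ["lines/0001.gt.txt", ...],   # ground-truth file paths
    "gt": ["ground truth text", ...],        # ground-truth line texts
}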
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_dir", type=str, required=True,
                        help="The base directory where to store all working files")
    parser.add_argument("--eval_files", type=str, nargs="+", required=True,
                        help="All files that shall be used for evaluation")
    parser.add_argument("--train_files", type=str, nargs="+", required=True,
                        help="All files that shall be used for (cross-fold) training")
    parser.add_argument("--n_lines", type=int, default=[-1], nargs="+",
                        help="Optional argument to specify the number of lines (images) used for training. "
                             "On default, all available lines will be used.")
    parser.add_argument("--run", type=str, default=None,
                        help="An optional command that will receive the train calls. Useful e.g. when using a resource "
                             "manager such as slurm.")

    parser.add_argument("--n_folds", type=int, default=5,
                        help="The number of fold, that is the number of models to train")
    parser.add_argument("--max_parallel_models", type=int, default=-1,
                        help="Number of models to train in parallel per fold. Defaults to all.")
    parser.add_argument("--weights", type=str, nargs="+", default=[],
                        help="Load network weights from the given file. If more than one file is provided the number "
                             "models must match the number of folds. Each fold is then initialized with the weights "
                             "of each model, respectively")
    parser.add_argument("--single_fold", type=int, nargs="+", default=[],
                        help="Only train a single (list of single) specific fold(s).")
    parser.add_argument("--skip_train", action="store_true",
                        help="Skip the cross fold training")
    parser.add_argument("--skip_eval", action="store_true",
                        help="Skip the cross fold evaluation")
    parser.add_argument("--verbose", action="store_true",
                        help="Verbose output")
    parser.add_argument("--n_confusions", type=int, default=0,
                        help="Only print n most common confusions. Defaults to 0, use -1 for all.")
    parser.add_argument("--xlsx_output", type=str,
                        help="Optionally write a xlsx file with the evaluation results")

    setup_train_args(parser, omit=["files", "validation", "weights",
                                   "early_stopping_best_model_output_dir", "early_stopping_best_model_prefix",
                                   "output_dir"])

    args = parser.parse_args()

    args.base_dir = os.path.abspath(os.path.expanduser(args.base_dir))

    np.random.seed(args.seed)
    random.seed(args.seed)

    # argument checks
    args.weights = glob_all(args.weights)
    if len(args.weights) > 1 and len(args.weights) != args.n_folds:
        raise Exception("Either no, one or n_folds (={}) models are required for pretraining but got {}.".format(
            args.n_folds, len(args.weights)
        ))

    if len(args.single_fold) > 0:
        if len(set(args.single_fold)) != len(args.single_fold):
            raise Exception("Repeated fold id's found.")
        for fold_id in args.single_fold:
            if fold_id < 0 or fold_id >= args.n_folds:
                raise Exception("Invalid fold id found: 0 <= id <= {}, but id == {}".format(args.n_folds, fold_id))

        actual_folds = args.single_fold
    else:
        actual_folds = list(range(args.n_folds))

    # run for all lines
    single_args = [copy.copy(args) for _ in args.n_lines]
    for s_args, n_lines in zip(single_args, args.n_lines):
        s_args.n_lines = n_lines

    predictions = parallel_map(run_for_single_line, single_args, progress_bar=False,
                               processes=len(single_args), use_thread_pool=True)

    # output predictions as csv:
    header = "lines," + ",".join([str(fold) for fold in range(args.n_folds)])\
             + ",avg,std,seq. vot., def. conf. vot., fuz. conf. vot."

    print(header)

    for prediction_map, n_lines in zip(predictions, args.n_lines):
        prediction = prediction_map["full"]
        data = "{}".format(n_lines)
        folds_lers = []
        for fold in range(len(actual_folds)):
            fold_eval = prediction[str(fold)]["eval"]
            data += ",{}".format(fold_eval['avg_ler'])
            folds_lers.append(fold_eval['avg_ler'])

        data += ",{},{}".format(np.mean(folds_lers), np.std(folds_lers))
        for voter in ['sequence_voter', 'confidence_voter_default_ctc', 'confidence_voter_fuzzy_ctc']:
            voter_eval = prediction[voter]["eval"]
            data += ",{}".format(voter_eval['avg_ler'])

        print(data)

    if args.n_confusions != 0:
        for prediction_map, n_lines in zip(predictions, args.n_lines):
            prediction = prediction_map["full"]
            print("")
            print("CONFUSIONS (lines = {})".format(n_lines))
            print("==========")
            print()

            for fold in range(len(actual_folds)):
                print("FOLD {}".format(fold))
                print_confusions(prediction[str(fold)]['eval'], args.n_confusions)

            for voter in ['sequence_voter', 'confidence_voter_default_ctc', 'confidence_voter_fuzzy_ctc']:
                print("VOTER {}".format(voter))
                print_confusions(prediction[voter]['eval'], args.n_confusions)

    if args.xlsx_output:
        data_list = []
        for prediction_map, n_lines in zip(predictions, args.n_lines):
            prediction = prediction_map["full"]
            for fold in actual_folds:
                pred = prediction[str(fold)]
                data_list.append({
                    "prefix": "L{} - Fold{}".format(n_lines, fold),
                    "results": pred['eval'],
                    "gt_files": prediction_map['gt_txts'],
                    "gts": prediction_map['gt'],
                    "preds": pred['data']
                })

            for voter in ['sequence_voter', 'confidence_voter_default_ctc']:
                pred = prediction[voter]
                data_list.append({
                    "prefix": "L{} - {}".format(n_lines, voter[:3]),
                    "results": pred['eval'],
                    "gt_files": prediction_map['gt_txts'],
                    "gts": prediction_map['gt'],
                    "preds": pred['data']
                })

        write_xlsx(args.xlsx_output, data_list)
Example #6
def main(args=None):
    if args is None:
        # parse args from command line
        parser = argparse.ArgumentParser()

        # fold parameters
        parser.add_argument("--files", nargs="+",
                            help="List all image files that shall be processed. Ground truth fils with the same "
                                 "base name but with '.gt.txt' as extension are required at the same location")
        parser.add_argument("--n_folds", type=int, default=5,
                            help="The number of fold, that is the number of models to train")
        parser.add_argument("--keep_temporary_files", action="store_true",
                            help="By default all temporary files (e.g. intermediate checkpoints) will be erased. Set this "
                                 "flag if you want to keep those files.")
        parser.add_argument("--best_models_dir", type=str, required=True,
                            help="path where to store the best models of each fold")
        parser.add_argument("--best_model_label", type=str, default="{id}",
                            help="The label of the best model in best model dirs. This will be string formatted. "
                                 "The default '{id}' will label the models 0, 1, 2, 3, ...")
        parser.add_argument("--temporary_dir", type=str, default=None,
                            help="A path to a temporary dir, where the intermediate model training data will be stored"
                                 "for each fold. Use --keep_temporary_files flag to keep the files. By default a system"
                                 "temporary dir will be used")
        parser.add_argument("--run", type=str, default=None,
                            help="An optional command that will receive the train calls. Useful e.g. when using a resource "
                                 "manager such as slurm.")
        parser.add_argument("--max_parallel_models", type=int, default=-1,
                            help="Number of models to train in parallel. Defaults to all.")
        parser.add_argument("--weights", type=str, nargs="+", default=[],
                            help="Load network weights from the given file. If more than one file is provided the number "
                                 "models must match the number of folds. Each fold is then initialized with the weights "
                                 "of each model, respectively. If a model path is set to 'None', this model will start "
                                 "from scratch")
        parser.add_argument("--single_fold", type=int, nargs="+", default=[],
                            help="Only train a single (list of single) specific fold(s).")

        # add the training args (omit those params, that are set by the cross fold training)
        setup_train_args(parser, omit=["files", "validation", "weights",
                                       "early_stopping_best_model_output_dir", "early_stopping_best_model_prefix",
                                       "output_dir"])

        args = parser.parse_args()

    # argument checks
    if len(args.weights) > 1 and len(args.weights) != args.n_folds:
        raise Exception("Either no, one or n_folds (={}) models are required for pretraining but got {}.".format(
            args.n_folds, len(args.weights)
        ))

    if len(args.single_fold) > 0:
        if len(set(args.single_fold)) != len(args.single_fold):
            raise Exception("Repeated fold id's found.")
        for fold_id in args.single_fold:
            if fold_id < 0 or fold_id >= args.n_folds:
                raise Exception("Invalid fold id found: 0 <= id <= {}, but id == {}".format(args.n_folds, fold_id))

    # automatically set the number of models that shall be run in parallel
    if args.max_parallel_models <= 0:
        args.max_parallel_models = args.n_folds

    # by default, the temporary files are deleted after a successful training;
    # if you specify a temporary dir, you can easily resume training if an error occurred
    if args.keep_temporary_files and not args.temporary_dir:
        raise Exception("If you want to keep the temporary model files you have to specify a temporary dir")

    if not args.temporary_dir:
        args.temporary_dir = tempfile.mkdtemp(prefix="calamari")
    else:
        args.temporary_dir = os.path.abspath(args.temporary_dir)
        if not os.path.exists(args.temporary_dir):
            os.makedirs(args.temporary_dir)

    # location of best models output
    if not os.path.exists(args.best_models_dir):
        os.makedirs(args.best_models_dir)

    # locate the training script (must be in the same dir as "this")
    train_script_path = os.path.join(this_absdir, "train.py")

    if not os.path.exists(train_script_path):
        raise Exception("Missing train script path. Expected 'train.py' at {}".format(this_absdir))

    # Compute the files in the cross fold (create a CrossFold)
    fold_file = os.path.join(args.temporary_dir, "folds.json")
    cross_fold = CrossFold(n_folds=args.n_folds, source_files=args.files, output_dir=args.best_models_dir)
    cross_fold.write_folds_to_json(fold_file)

    # Create the json argument file for each individual training
    run_args = []
    folds_to_run = args.single_fold if len(args.single_fold) > 0 else range(len(cross_fold.folds))
    for fold in folds_to_run:
        train_files = cross_fold.train_files(fold)
        test_files = cross_fold.test_files(fold)
        path = os.path.join(args.temporary_dir, "fold_{}.json".format(fold))
        with open(path, 'w') as f:
            fold_args = vars(args).copy()
            fold_args["id"] = fold
            fold_args["files"] = train_files
            fold_args["validation"] = test_files
            fold_args["train_script"] = train_script_path
            fold_args["verbose"] = True
            fold_args["output_dir"] = os.path.join(args.temporary_dir, "fold_{}".format(fold))
            fold_args["early_stopping_best_model_output_dir"] = args.best_models_dir
            fold_args["early_stopping_best_model_prefix"] = args.best_model_label.format(id=fold)

            if args.seed >= 0:
                fold_args["seed"] = args.seed + fold

            if len(args.weights) == 1:
                fold_args["weights"] = args.weights[0]
            elif len(args.weights) > 1:
                fold_args["weights"] = args.weights[fold]
            else:
                fold_args["weights"] = None

            # start from scratch via None
            if fold_args["weights"]:
                if len(fold_args["weights"].strip()) == 0 or fold_args["weights"].upper() == "NONE":
                    fold_args["weights"] = None

            json.dump(
                fold_args,
                f,
                indent=4,
            )

        run_args.append({"json": path, "args": fold_args})

    # Launch the individual processes for each training
    with multiprocessing.Pool(processes=args.max_parallel_models) as pool:
        # workaround to forward keyboard interrupt
        pool.map_async(train_individual_model, run_args).get(999999999)
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_dir", type=str, required=True,
                        help="The base directory where to store all working files")
    parser.add_argument("--eval_files", type=str, nargs="+", required=True,
                        help="All files that shall be used for evaluation")
    parser.add_argument("--n_lines", type=int, default=[-1], nargs="+",
                        help="Optional argument to specify the number of lines (images) used for training. "
                             "On default, all available lines will be used.")
    parser.add_argument("--run", type=str, default=None,
                        help="An optional command that will receive the train calls. Useful e.g. when using a resource "
                             "manager such as slurm.")

    parser.add_argument("--skip_train", action="store_true",
                        help="Skip the cross fold training")
    parser.add_argument("--skip_eval", action="store_true",
                        help="Skip the cross fold evaluation")
    parser.add_argument("--verbose", action="store_true",
                        help="Verbose output")
    parser.add_argument("--n_confusions", type=int, default=0,
                        help="Only print n most common confusions. Defaults to 0, use -1 for all.")
    parser.add_argument("--xlsx_output", type=str,
                        help="Optionally write a xlsx file with the evaluation results")

    setup_train_args(parser, omit=["early_stopping_best_model_output_dir", "output_dir"])

    args = parser.parse_args()

    args.base_dir = os.path.abspath(os.path.expanduser(args.base_dir))

    np.random.seed(args.seed)
    random.seed(args.seed)

    # run for all lines
    single_args = [copy.copy(args) for _ in args.n_lines]
    for s_args, n_lines in zip(single_args, args.n_lines):
        s_args.n_lines = n_lines

    predictions = parallel_map(run_for_single_line, single_args, progress_bar=False,
                               processes=len(single_args), use_thread_pool=True)
    predictions = list(predictions)

    # output predictions as csv:
    header = "lines," + ",".join([str(fold) for fold in range(len(predictions[0]["full"]) - 1)])\
             + ",avg,std,voted"

    print(header)

    for prediction_map, n_lines in zip(predictions, args.n_lines):
        prediction = prediction_map["full"]
        data = "{}".format(n_lines)
        folds_lers = []
        for fold, pred in prediction.items():
            if fold == 'voted':
                continue

            fold_eval = pred["eval"]
            data += ",{}".format(fold_eval['avg_ler'])
            folds_lers.append(fold_eval['avg_ler'])

        data += ",{},{}".format(np.mean(folds_lers), np.std(folds_lers))
        voted_eval = prediction['voted']["eval"]
        data += ",{}".format(voted_eval['avg_ler'])

        print(data)

    if args.n_confusions != 0:
        for prediction_map, n_lines in zip(predictions, args.n_lines):
            prediction = prediction_map["full"]
            print("")
            print("CONFUSIONS (lines = {})".format(n_lines))
            print("==========")
            print()

            for fold, pred in prediction.items():
                print("FOLD {}".format(fold))
                print_confusions(pred['eval'], args.n_confusions)

    if args.xlsx_output:
        data_list = []
        for prediction_map, n_lines in zip(predictions, args.n_lines):
            prediction = prediction_map["full"]
            for fold, pred in prediction.items():
                # label per-fold rows "FoldN" and the combined result "voted";
                # the predictions here contain only fold ids plus 'voted' (see the
                # csv section above), so no separate voter loop is needed
                prefix = "L{} - {}".format(n_lines, fold if fold == 'voted' else "Fold{}".format(fold))
                data_list.append({
                    "prefix": prefix,
                    "results": pred['eval'],
                    "gt_files": prediction_map['gt_txts'],
                    "gts": prediction_map['gt'],
                    "preds": pred['data']
                })

        write_xlsx(args.xlsx_output, data_list)