def _load_process(self, executor, main_prog):
        """
        Load model state into ``main_prog`` and report the training progress.

        The behaviour is selected by three entries of ``self.args``:

        * ``continue_train`` and ``read_checkpoint`` both truthy: restore a
          training snapshot from ``load_model_path`` and pick up the progress
          counters stored with it.
        * otherwise, ``load_model_path`` not an empty string: load only the
          model parameters found at that path (pre-trained model / baseline).
        * otherwise: nothing is loaded and training starts from scratch.

        :param executor: executor handed to the ``model_utils`` loaders
        :param main_prog: program whose parameters are loaded
        :return: dict with the keys ``total_step``, ``total_epoch`` and
                 ``step_in_epoch`` (per the original note, the caller keeps
                 it in ``self.train_status``)
        """
        resume = self.args["continue_train"]
        params_path = self.args["load_model_path"]
        from_checkpoint = self.args["read_checkpoint"]

        # Progress of a run that has not trained yet.
        status = {'total_step': 0, 'total_epoch': 1, 'step_in_epoch': 0}

        if not (resume and from_checkpoint):
            # Not resuming from a checkpoint: optionally warm-start from
            # pre-trained parameters, progress counters stay at defaults.
            if params_path != "":
                model_utils.load_model_params(exe=executor, program=main_prog, params_path=params_path)
                self.logger.info("Pre-trained model file in {} has been loaded".format(params_path))
            return status

        # Resuming: restore parameters and whatever progress was recorded.
        snapshot_info = model_utils.load_train_snapshot(executor, main_prog, params_path)
        self.logger.info("Model file in {} has been loaded".format(params_path))
        if snapshot_info:
            status['total_step'] = snapshot_info.get("total_step", 0)
            status['step_in_epoch'] = snapshot_info.get("step_in_epoch", 0)
            status['total_epoch'] = snapshot_info.get("epoch", 1)
            self.logger.info("Load train info: {}".format(snapshot_info))
        return status
    def init_model(self):
        """
        Build the prediction network and load its parameters.

        Steps: create ``self.exe``, define the network inside
        ``self.predict_program`` / ``self.predict_startup``, run the startup
        program, load the parameters found at ``self.args["load_model_path"]``
        and, when ``self.args["use_parallel"]`` is truthy, replace
        ``self.predict_program`` with a data-parallel compiled program.

        :return: tuple ``(loader, probs, qas_id)`` as produced by
                 ``classifier.create_model_for_cls_merge``
        """
        params_path = self.args["load_model_path"]
        self.logger.info("Initializing predict model...")

        run_places = TrainEngine.get_executor_run_places(self.args)
        self.exe = fluid.Executor(run_places)

        with fluid.program_guard(self.predict_program, self.predict_startup):
            # Define the network (originally noted as following gzl's model)
            # and obtain its output placeholders.
            loader, probs, qas_id = classifier.create_model_for_cls_merge(
                args=self.args_model_build, is_prediction=True)
            self.logger.info("Prediction neural network created.")

        self.logger.info("Prediction neural network parameter initialized.")

        # Run the startup program to give the parameters their initial values.
        self.exe.run(self.predict_startup)

        # Load the stored model parameters into the network.
        load_model_params(self.exe, params_path, self.predict_program)

        if self.args["use_parallel"]:
            # The parallel strategy is hard-coded for now.
            strategy = fluid.BuildStrategy()
            strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
            compiled = fluid.CompiledProgram(self.predict_program)
            self.predict_program = compiled.with_data_parallel(
                places=TrainEngine.get_data_run_places(self.args),
                build_strategy=strategy)

        self.logger.info("Finish initializing predict model!")
        return loader, probs, qas_id
    def _load_process(self, executor, main_prog):
        """
        Load the model parameters stored at ``self.args["load_model_path"]``
        into ``main_prog`` and log that they were loaded.

        :param executor: executor handed to ``model_utils.load_model_params``
        :param main_prog: program whose parameters are loaded
        :return: None (this variant produces no training-state dict)
        """
        params_path = self.args["load_model_path"]

        model_utils.load_model_params(exe=executor, program=main_prog, params_path=params_path)
        loaded_msg = "Pre-trained model file in {} has been loaded".format(params_path)
        self.logger.info(loaded_msg)
예제 #4
0
    def _load_process(self, executor, main_prog):
        """
        Load the model parameters stored at ``self.args["load_model_path"]``
        into ``main_prog`` and log that they were loaded.

        Unlike what the earlier description suggested, this variant does not
        consult ``continue_train`` or ``read_checkpoint``: the parameters are
        always loaded from ``load_model_path``.

        :param executor: executor handed to ``model_utils.load_model_params``
        :param main_prog: program whose parameters are loaded
        :return: None
        """
        params_path = self.args["load_model_path"]

        model_utils.load_model_params(exe=executor, program=main_prog, params_path=params_path)
        loaded_msg = "Pre-trained model file in {} has been loaded".format(params_path)
        self.logger.info(loaded_msg)
    def train(self):
        """
        Initialise the training program, optionally warm-start it from
        pre-trained parameters, and execute one training iteration pass.

        Only ``self.args["load_model_path"]`` influences this variant: when
        it is not an empty string the parameters at that path are loaded into
        ``self.origin_train_prog``. No validation, snapshotting or early
        stopping is performed here, and the result of the training pass is
        discarded.

        :return: None
        """
        model_path = self.args["load_model_path"]

        # Create the executor and run the startup program to initialise
        # the parameters.
        executor = fluid.Executor(self.get_executor_run_places(self.args))
        executor.run(self.train_startup_prog)

        if model_path != "":
            # Warm-start from the pre-trained parameters.
            model_utils.load_model_params(exe=executor,
                                          program=self.origin_train_prog,
                                          params_path=model_path)
            self.logger.info(
                "Pre-trained model file in {} has been loaded".format(
                    model_path))

        self.logger.info("Ready to train the model.Executing...")
        # Single pass starting from zeroed counters; the positional arguments
        # after the executor are total_step, epoch id and step_in_epoch.
        self.__run_train_iterable(executor, 0, 0, 0)
예제 #6
0
    def train(self):
        """
        Run the training loop, saving a snapshot after every epoch and a
        final snapshot once the loop ends.

        Start-up behaviour is selected through ``self.args``:

        * ``continue_train`` truthy: restore the training snapshot stored at
          ``load_model_path`` and resume from the ``total_step``,
          ``step_in_epoch`` and ``epoch`` values recorded with it.
        * otherwise, ``pretrained_model_path`` not an empty string: load only
          the model parameters from that path.
        * otherwise: train from the startup-initialised parameters.

        This variant performs no validation and no early stopping.

        :return: None
        """
        APP_NAME = self.args["app_name"]
        MAX_EPOCH = self.args["max_epoch"]
        CONTINUE = self.args["continue_train"]
        PRETRAIN_MODEL = self.args["pretrained_model_path"]

        # Create the executor and run the startup program to initialise
        # the parameters.
        executor = fluid.Executor(self.get_executor_run_places(self.args))
        executor.run(self.train_startup_prog)

        total_step = 0
        step_in_epoch = 0
        total_epoch = 0
        if CONTINUE:
            # Resume: restore parameters and the recorded progress.
            MODEL_PATH = self.args["load_model_path"]
            info = model_utils.load_train_snapshot(executor,
                                                   self.origin_train_prog,
                                                   MODEL_PATH)
            self.logger.info(
                "Model file in {} has been loaded".format(MODEL_PATH))
            if info:
                # Must be an assignment: a comparison here would silently
                # drop the restored step count.
                total_step = info.get("total_step", 0)
                step_in_epoch = info.get("step_in_epoch", 0)
                total_epoch = info.get("epoch", 0)
        elif PRETRAIN_MODEL != "":
            # First run with a pre-trained model available: load only its
            # parameters.
            model_utils.load_model_params(exe=executor,
                                          program=self.origin_train_prog,
                                          params_path=PRETRAIN_MODEL)
            self.logger.info(
                "Pre-trained model file in {} has been loaded".format(
                    PRETRAIN_MODEL))

        self.logger.info("Ready to train the model.Executing...")

        # Start at the restored epoch so epoch ids keep increasing after a
        # resume and per-epoch snapshot names are never reused.
        for epoch_id in range(total_epoch, MAX_EPOCH):
            # One epoch of training.
            total_step, loss = self.__run_train_iterable(
                executor, total_step, epoch_id, step_in_epoch)
            # Only the first resumed epoch starts part-way through.
            step_in_epoch = 0
            self.logger.info(
                'Epoch {epoch} done, train mean loss is {loss}'.format(
                    epoch=epoch_id, loss=loss))

            # Save a per-epoch snapshot together with the progress counters.
            info = {"total_step": total_step, "epoch": epoch_id}
            file_path = model_utils.save_train_snapshot(
                executor,
                self.origin_train_prog,
                file_name="{}_epoch{}".format(APP_NAME, epoch_id),
                train_info=info)
            self.logger.info(
                "Snapshot of training process has been saved as folder {}".
                format(file_path))

        # Save the final model.
        file_path = model_utils.save_train_snapshot(executor,
                                                    self.origin_train_prog,
                                                    APP_NAME)
        self.logger.info(
            "Training process completed. model saved in {}".format(file_path))
    def train(self):
        """
        Run the training loop with per-epoch validation, per-epoch snapshots
        and optional early stopping, then save a final snapshot.

        Start-up behaviour is selected through ``self.args``:

        * ``continue_train`` and ``read_checkpoint`` both truthy: restore the
          training snapshot at ``load_model_path`` and resume from the
          recorded ``total_step``, ``step_in_epoch`` and ``epoch``.
        * otherwise, ``load_model_path`` not an empty string: load only the
          model parameters from that path.
        * otherwise: train from the startup-initialised parameters.

        When ``early_stopping`` is truthy, the negated validation accuracy is
        passed to ``self.early_stopping_strategy`` after every epoch and the
        loop ends as soon as it reports that training should stop.

        :return: None
        """
        app_name = self.args["app_name"]
        max_epoch = self.args["max_epoch"]
        # Looked up but not used below; kept so a missing key still raises.
        _snapshot_frequency = self.args["snapshot_frequency"]

        use_early_stopping = self.args["early_stopping"]
        if use_early_stopping:
            stop_threshold = self.args["early_stopping_threshold"]
            stop_patience = self.args["early_stopping_stand_times"]

        resume = self.args["continue_train"]
        model_path = self.args["load_model_path"]
        from_checkpoint = self.args["read_checkpoint"]

        # Create the executor and run the startup program to initialise
        # the parameters.
        run_places = self.get_executor_run_places(self.args)
        executor = fluid.Executor(run_places)
        executor.run(self.train_startup_prog)

        total_step, step_in_epoch, first_epoch = 0, 0, 0
        if resume and from_checkpoint:
            # Resume: restore parameters and the recorded progress.
            snapshot_info = model_utils.load_train_snapshot(
                executor, self.origin_train_prog, model_path)
            self.logger.info(
                "Model file in {} has been loaded".format(model_path))
            if snapshot_info:
                total_step = snapshot_info.get("total_step", 0)
                step_in_epoch = snapshot_info.get("step_in_epoch", 0)
                first_epoch = snapshot_info.get("epoch", 0)
                self.logger.info("Load train info: {}".format(snapshot_info))
        elif model_path != "":
            # First run with a pre-trained model available: load only its
            # parameters.
            model_utils.load_model_params(exe=executor,
                                          program=self.origin_train_prog,
                                          params_path=model_path)
            self.logger.info(
                "Pre-trained model file in {} has been loaded".format(
                    model_path))

        self.logger.info("Ready to train the model.Executing...")

        for epoch_id in range(first_epoch, max_epoch):
            # One epoch of training.
            total_step, loss = self.__run_train_iterable(
                executor, total_step, epoch_id, step_in_epoch)
            # Only the first resumed epoch starts part-way through.
            step_in_epoch = 0
            self.logger.info(
                'Epoch {epoch} done, train mean loss is {loss}'.format(
                    epoch=epoch_id, loss=loss))

            # Evaluate on the validation set.
            _, valid_acc = self.__valid(executor)
            self.logger.info(' Epoch {epoch} Validated'.format(epoch=epoch_id))

            # Save a per-epoch snapshot together with the progress counters.
            progress = {"total_step": total_step, "epoch": epoch_id}
            snapshot_name = "{}_epoch{}".format(app_name, epoch_id)
            file_path = model_utils.save_train_snapshot(
                executor,
                self.origin_train_prog,
                file_name=snapshot_name,
                train_info=progress)
            self.logger.info(
                "Snapshot of training process has been saved as folder {}".
                format(file_path))

            if not use_early_stopping:
                continue
            # Early stopping is fed the negated validation accuracy.
            should_stop = self.early_stopping_strategy(
                -valid_acc,
                threshold=stop_threshold,
                standstill_step=stop_patience)
            if should_stop:
                self.logger.info(
                    "Performance improvement stalled, ending the training process"
                )
                break

        # Save the final model.
        file_path = model_utils.save_train_snapshot(executor,
                                                    self.origin_train_prog,
                                                    app_name)
        self.logger.info(
            "Training process completed. model saved in {}".format(file_path))