def init_infer_program(self):
    # define the inferer
    self.infer_program = fluid.Program()
    startup_prog = fluid.Program()

    # prepare the network
    with fluid.program_guard(self.infer_program, startup_prog):
        with fluid.unique_name.guard():
            self.infer_feeder, self.infer_log_probs, _ = self.create_network(is_infer=True)
    self.infer_program = self.infer_program.clone(for_test=True)

    self.infer_exe = fluid.Executor(self._place)
    self.infer_exe.run(startup_prog)

    # init params from the pretrained model
    if not self._init_from_pretrained_model:
        exit("The pretrained model file does not exist!")
    self.init_from_pretrained_model(self.infer_exe, self.infer_program)

    # support multi-GPU inference
    build_strategy = compiler.BuildStrategy()
    exec_strategy = fluid.ExecutionStrategy()
    self.infer_compiled_prog = compiler.CompiledProgram(self.infer_program).with_data_parallel(
        build_strategy=build_strategy, exec_strategy=exec_strategy)
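# A minimal usage sketch (an assumption for illustration, not part of the
# class): `model` stands for an instance of this class and `feature` /
# `feature_len` for one preprocessed utterance. After init_infer_program(),
# the compiled program can be reused for every batch:
#
#   model.init_infer_program()
#   probs = model.infer_exe.run(
#       program=model.infer_compiled_prog,
#       feed=model.infer_feeder.feed([(feature, feature_len)]),
#       fetch_list=[model.infer_log_probs],
#       return_numpy=False)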
def train(self,
          train_batch_reader,
          dev_batch_reader,
          learning_rate,
          gradient_clipping,
          num_epoch,
          batch_size,
          num_samples,
          test_off=False):
    """Train the model.

    :param train_batch_reader: Train data reader.
    :type train_batch_reader: callable
    :param dev_batch_reader: Validation data reader.
    :type dev_batch_reader: callable
    :param learning_rate: Learning rate for the ADAM optimizer.
    :type learning_rate: float
    :param gradient_clipping: Gradient clipping threshold.
    :type gradient_clipping: float
    :param num_epoch: Number of training epochs.
    :type num_epoch: int
    :param batch_size: Batch size.
    :type batch_size: int
    :param num_samples: Number of training samples.
    :type num_samples: int
    :param test_off: Turn off testing.
    :type test_off: bool
    """
    # prepare the model output directory
    if not os.path.exists(self._output_model_dir):
        mkpath(self._output_model_dir)

    if isinstance(self._place, fluid.CUDAPlace):
        dev_count = fluid.core.get_cuda_device_count()
        learning_rate = learning_rate * dev_count
    else:
        dev_count = int(os.environ.get('CPU_NUM', 1))

    # prepare the network
    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_reader, _, ctc_loss = self.create_network()
            # learning rate with exponential decay
            learning_rate = fluid.layers.exponential_decay(
                learning_rate=learning_rate,
                decay_steps=num_samples / batch_size / dev_count,
                decay_rate=0.83,
                staircase=True)
            # prepare the optimizer
            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=learning_rate,
                regularization=fluid.regularizer.L2Decay(0.0001),
                grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=gradient_clipping))
            optimizer.minimize(loss=ctc_loss)

    exe = fluid.Executor(self._place)
    exe.run(startup_prog)

    # init from a pretrained model, to better solve the current task
    pre_epoch = 0
    if self._init_from_pretrained_model:
        pre_epoch = self.init_from_pretrained_model(exe, train_program)

    build_strategy = compiler.BuildStrategy()
    exec_strategy = fluid.ExecutionStrategy()
    # pass the build_strategy to the with_data_parallel API
    train_compiled_prog = compiler.CompiledProgram(train_program).with_data_parallel(
        loss_name=ctc_loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    train_reader.set_batch_generator(train_batch_reader)

    train_step = 0
    test_step = 0
    num_batch = -1
    # run train
    for epoch_id in range(num_epoch):
        train_reader.start()
        epoch_loss = []
        time_begin = time.time()
        batch_id = 0
        while True:
            try:
                fetch_list = [ctc_loss.name, learning_rate.name]
                if batch_id % 100 == 0:
                    fetch = exe.run(program=train_compiled_prog,
                                    fetch_list=fetch_list,
                                    return_numpy=False)
                    each_loss = fetch[0]
                    each_learning_rate = np.array(fetch[1])[0]
                    epoch_loss.extend(np.array(each_loss[0]) / batch_size)
                    print("Train [%s] epoch: [%d/%d], batch: [%d/%d], learning rate: %f, train loss: %f\n" %
                          (datetime.now(), epoch_id, num_epoch, batch_id, num_batch,
                           each_learning_rate, np.mean(each_loss[0]) / batch_size))
                    # log the training loss and learning rate
                    self.writer.add_scalar('Train loss', np.mean(each_loss[0]) / batch_size, train_step)
                    self.writer.add_scalar('Learning rate', each_learning_rate, train_step)
                    train_step += 1
                else:
                    _ = exe.run(program=train_compiled_prog,
                                fetch_list=[],
                                return_numpy=False)
                # save the model every 2000 batches
                if batch_id % 2000 == 0 and batch_id != 0:
                    self.save_param(exe, train_program, "epoch_" + str(epoch_id + pre_epoch))
                batch_id = batch_id + 1
            except fluid.core.EOFException:
                train_reader.reset()
                break
        num_batch = batch_id
        # save the model once per epoch
        self.save_param(exe, train_program, "epoch_" + str(epoch_id + pre_epoch))

        used_time = time.time() - time_begin
        if test_off:
            print('======================last Train=====================')
            print("Train time: %f sec, epoch: %d, train loss: %f\n" %
                  (used_time, epoch_id, np.mean(np.array(epoch_loss))))
            print('======================last Train=====================')
        else:
            print('\n======================Begin test=====================')
            # point model loading at the checkpoint just saved
            self._init_from_pretrained_model = self.save_model_path
            # run the test
            test_result = self.test(test_reader=dev_batch_reader)
            print("Train time: %f sec, epoch: %d, train loss: %f, test %s: %f" %
                  (used_time, epoch_id + pre_epoch, np.mean(np.array(epoch_loss)),
                   self.error_rate_type, test_result))
            print('======================Stop Train=====================\n')
            # log the test result
            self.writer.add_scalar('Test %s' % self.error_rate_type, test_result, test_step)
            test_step += 1

    self.save_param(exe, train_program, "step_final")
    print("\n------------Training finished!!!-------------")
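# A minimal call sketch (hedged; `model`, `train_gen`, and `dev_gen` are
# hypothetical stand-ins for a trainer instance and two batch generators
# built elsewhere in the project). Note that decay_steps is
# num_samples / batch_size / dev_count, so the learning rate is multiplied
# by 0.83 roughly once per epoch:
#
#   model.train(train_batch_reader=train_gen,
#               dev_batch_reader=dev_gen,
#               learning_rate=5e-4,
#               gradient_clipping=400,
#               num_epoch=50,
#               batch_size=32,
#               num_samples=120000)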
def train(self,
          train_batch_reader,
          dev_batch_reader,
          feeding_dict,
          learning_rate,
          gradient_clipping,
          num_epoch,
          batch_size,
          num_samples,
          save_epoch=100,
          num_iterations_print=100,
          test_off=False):
    """Train the model.

    :param train_batch_reader: Train data reader.
    :type train_batch_reader: callable
    :param dev_batch_reader: Validation data reader.
    :type dev_batch_reader: callable
    :param feeding_dict: Feeding is a map of field name and tuple index
                         of the data that reader returns.
    :type feeding_dict: dict|list
    :param learning_rate: Learning rate for the ADAM optimizer.
    :type learning_rate: float
    :param gradient_clipping: Gradient clipping threshold.
    :type gradient_clipping: float
    :param num_epoch: Number of training epochs.
    :type num_epoch: int
    :param batch_size: Batch size.
    :type batch_size: int
    :param num_samples: Number of training samples.
    :type num_samples: int
    :param save_epoch: Number of epochs between checkpoint saves.
    :type save_epoch: int
    :param num_iterations_print: Number of training iterations between
                                 printings of the training loss.
    :type num_iterations_print: int
    :param test_off: Turn off testing.
    :type test_off: bool
    """
    # prepare the model output directory
    if not os.path.exists(self._output_model_dir):
        mkpath(self._output_model_dir)

    # adapt the feeding dict according to the network
    adapted_feeding_dict = self._adapt_feeding_dict(feeding_dict)

    if isinstance(self._place, fluid.CUDAPlace):
        dev_count = fluid.core.get_cuda_device_count()
    else:
        dev_count = int(os.environ.get('CPU_NUM', 1))

    # prepare the network
    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_reader, log_probs, ctc_loss = self.create_network()
            # prepare the optimizer
            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=fluid.layers.exponential_decay(
                    learning_rate=learning_rate,
                    decay_steps=num_samples / batch_size / dev_count,
                    decay_rate=0.83,
                    staircase=True))
            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=gradient_clipping))
            optimizer.minimize(loss=ctc_loss)

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_reader, _, ctc_loss = self.create_network()
    test_prog = test_prog.clone(for_test=True)

    exe = fluid.Executor(self._place)
    exe.run(startup_prog)

    # init from a pretrained model, to better solve the current task
    pre_epoch = 0
    if self._init_from_pretrained_model:
        pre_epoch = self.init_from_pretrained_model(exe, train_program)

    build_strategy = compiler.BuildStrategy()
    exec_strategy = fluid.ExecutionStrategy()
    # pass the build_strategy to the with_data_parallel API
    compiled_prog = compiler.CompiledProgram(train_program).with_data_parallel(
        loss_name=ctc_loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    train_reader.set_batch_generator(train_batch_reader)
    test_reader.set_batch_generator(dev_batch_reader)

    # run train
    for epoch_id in range(num_epoch):
        train_reader.start()
        epoch_loss = []
        time_begin = time.time()
        batch_id = 0
        while True:
            try:
                fetch_list = [ctc_loss.name]
                if batch_id % num_iterations_print == 0:
                    fetch = exe.run(program=compiled_prog,
                                    fetch_list=fetch_list,
                                    return_numpy=False)
                    each_loss = fetch[0]
                    epoch_loss.extend(np.array(each_loss[0]) / batch_size)
                    print("epoch: %d, batch: %d, train loss: %f\n" %
                          (epoch_id, batch_id, np.mean(each_loss[0]) / batch_size))
                else:
                    _ = exe.run(program=compiled_prog,
                                fetch_list=[],
                                return_numpy=False)
                batch_id = batch_id + 1
            except fluid.core.EOFException:
                train_reader.reset()
                break

        time_end = time.time()
        used_time = time_end - time_begin
        if test_off:
            print("\n--------Time: %f sec, epoch: %d, train loss: %f\n" %
                  (used_time, epoch_id, np.mean(np.array(epoch_loss))))
        else:
            print('\n----------Begin test...')
            test_loss = self.test(exe,
                                  dev_batch_reader=dev_batch_reader,
                                  test_program=test_prog,
                                  test_reader=test_reader,
                                  fetch_list=[ctc_loss])
            print("--------Time: %f sec, epoch: %d, train loss: %f, test loss: %f" %
                  (used_time, epoch_id + pre_epoch, np.mean(np.array(epoch_loss)),
                   test_loss / batch_size))
        # save a checkpoint every `save_epoch` epochs
        if (epoch_id + 1) % save_epoch == 0:
            self.save_param(exe, train_program, "epoch_" + str(epoch_id + pre_epoch))

    self.save_param(exe, train_program, "step_final")
    print("\n------------Training finished!!!-------------")
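# A minimal call sketch for this variant (hedged; all names and the
# feeding_dict keys below are hypothetical, chosen only to illustrate the
# shape of the arguments). `feeding_dict` maps field names to the tuple
# indices the readers emit; `save_epoch` saves a checkpoint every N epochs
# and `num_iterations_print` controls how often the loss is printed:
#
#   model.train(train_batch_reader=train_gen,
#               dev_batch_reader=dev_gen,
#               feeding_dict={'audio_data': 0, 'text_data': 1, 'seq_len_data': 2},
#               learning_rate=5e-4,
#               gradient_clipping=400,
#               num_epoch=200,
#               batch_size=32,
#               num_samples=120000,
#               save_epoch=1,
#               num_iterations_print=100)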