def train(args):
    """OCR training (static-graph fluid version).

    Builds either the CRNN-CTC or the attention OCR network (selected by
    ``args.model``), trains it with optional ParallelExecutor, logs
    loss/sequence-error periodically, evaluates and checkpoints on a period,
    and prints latency/FPS benchmark statistics at the end.
    """
    # Select network builder and feeder by model type.
    if args.model == "crnn_ctc":
        train_net = ctc_train_net
        get_feeder_data = get_ctc_feeder_data
    else:
        train_net = attention_train_net
        get_feeder_data = get_attention_feeder_data

    num_classes = None
    num_classes = data_reader.num_classes() if num_classes is None else num_classes
    data_shape = data_reader.data_shape()

    # Define network: builder returns the loss var, an evaluator, the
    # test/inference program clone and an (optional) ModelAverage object.
    sum_cost, error_evaluator, inference_program, model_average = train_net(
        args, data_shape, num_classes)

    # Data readers. The train reader cycles forever when a fixed total_step
    # budget is requested (so the step counter, not the epoch, stops training).
    train_reader = data_reader.train(
        args.batch_size,
        train_images_dir=args.train_images,
        train_list_file=args.train_list,
        cycle=args.total_step > 0,
        model=args.model)
    test_reader = data_reader.test(
        test_images_dir=args.test_images,
        test_list_file=args.test_list,
        model=args.model)

    # Prepare execution environment.
    place = fluid.CPUPlace()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)

    # Fixed seed for continuous-evaluation (CE) reproducibility runs.
    if 'ce_mode' in os.environ:
        fluid.default_startup_program().random_seed = 90

    exe.run(fluid.default_startup_program())

    # Load init model (parameters only) if a checkpoint dir was given.
    if args.init_model is not None:
        model_dir = args.init_model
        fluid.load(
            fluid.default_main_program(),
            model_dir,
            var_list=fluid.io.get_program_parameter(
                fluid.default_main_program()))
        print("Init model from: %s." % args.init_model)

    train_exe = exe
    error_evaluator.reset(exe)
    if args.parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=True if args.use_gpu else False,
            loss_name=sum_cost.name)

    fetch_vars = [sum_cost] + error_evaluator.metrics

    def train_one_batch(data):
        """Run one training step; returns scalar values for fetch_vars."""
        var_names = [var.name for var in fetch_vars]
        if args.parallel:
            results = train_exe.run(var_names,
                                    feed=get_feeder_data(data, place))
            # ParallelExecutor returns per-device values; sum across devices.
            results = [np.array(result).sum() for result in results]
        else:
            results = train_exe.run(feed=get_feeder_data(data, place),
                                    fetch_list=fetch_vars)
            results = [result[0] for result in results]
        return results

    def test(iter_num):
        """Evaluate sequence error on the whole test set and print it."""
        error_evaluator.reset(exe)
        for data in test_reader():
            exe.run(inference_program, feed=get_feeder_data(data, place))
        _, test_seq_error = error_evaluator.eval(exe)
        print("\n[%s] - Iter[%d]; Test seq error: %s.\n" % (time.asctime(
            time.localtime(time.time())), iter_num, str(test_seq_error[0])))

        # Note: The following logs are special for CE monitoring.
        # Other situations do not need to care about these logs.
        if 'ce_mode' in os.environ:
            print("kpis test_acc %f" % (1 - test_seq_error[0]))

    def save_model(args, exe, iter_num):
        """Persist the current parameters under save_model_dir."""
        filename = "model_%05d" % iter_num
        fluid.save(fluid.default_main_program(),
                   os.path.join(args.save_model_dir, filename))
        print("Saved model to: %s/%s." % (args.save_model_dir, filename))

    iter_num = 0
    stop = False
    start_time = time.time()
    while not stop:
        total_loss = 0.0
        total_seq_error = 0.0
        # NOTE(review): batch_times is re-created each pass, so the benchmark
        # section below only covers the final pass — confirm intended.
        batch_times = []
        # train a pass
        for data in train_reader():
            # Stop after total_step real iterations plus the warm-up batches.
            if args.total_step > 0 and iter_num == args.total_step + args.skip_batch_num:
                stop = True
                break
            if iter_num < args.skip_batch_num:
                print("Warm-up iteration")
            if iter_num == args.skip_batch_num:
                # Drop profiler stats gathered during warm-up.
                profiler.reset_profiler()
            start = time.time()
            results = train_one_batch(data)
            batch_time = time.time() - start
            fps = args.batch_size / batch_time
            batch_times.append(batch_time)
            total_loss += results[0]
            # results layout: [sum_cost] + evaluator metrics; index 2 is
            # presumably the sequence-error metric — matches fetch_vars order.
            total_seq_error += results[2]

            iter_num += 1
            # training log
            if iter_num % args.log_period == 0:
                print("\n[%s] - Iter[%d]; Avg loss: %.3f; Avg seq err: %.3f" % (
                    time.asctime(time.localtime(time.time())),
                    iter_num,
                    total_loss / (args.log_period * args.batch_size),
                    total_seq_error / (args.log_period * args.batch_size)))
                if 'ce_mode' in os.environ:
                    print("kpis train_cost %f" % (total_loss / (
                        args.log_period * args.batch_size)))
                    print("kpis train_acc %f" % (
                        1 - total_seq_error / (args.log_period * args.batch_size)))
                total_loss = 0.0
                total_seq_error = 0.0

            # evaluate periodically; apply averaged weights if enabled.
            if not args.skip_test and iter_num % args.eval_period == 0:
                if model_average:
                    with model_average.apply(exe):
                        test(iter_num)
                else:
                    test(iter_num)

            # save model
            if iter_num % args.save_model_period == 0:
                if model_average:
                    with model_average.apply(exe):
                        save_model(args, exe, iter_num)
                else:
                    save_model(args, exe, iter_num)

    end_time = time.time()
    if 'ce_mode' in os.environ:
        print("kpis train_duration %f" % (end_time - start_time))

    # Postprocess benchmark data: exclude warm-up batches from statistics.
    latencies = batch_times[args.skip_batch_num:]
    latency_avg = np.average(latencies)
    latency_pc99 = np.percentile(latencies, 99)
    fpses = np.divide(args.batch_size, latencies)
    fps_avg = np.average(fpses)
    # 1st percentile of FPS corresponds to the 99th percentile of latency.
    fps_pc99 = np.percentile(fpses, 1)

    # Benchmark output
    print('\nTotal examples (incl. warm-up): %d' % (iter_num * args.batch_size))
    print('average latency: %.5f s, 99pc latency: %.5f s' % (latency_avg,
                                                             latency_pc99))
    print('average fps: %.5f, fps for 99pc latency: %.5f' % (fps_avg,
                                                             fps_pc99))
def train(args):
    """OCR attention-model training (dygraph version).

    Builds an OCRAttention model, trains it with Adam (using the learning
    rate schedule selected by ``Config.learning_rate_decay``), logs the
    running loss every 1000 batches, and every 2000 global steps saves the
    parameters to ``model/<step>.npz`` and runs a greedy-decode evaluation
    on the test list.
    """
    with fluid.dygraph.guard():
        ocr_attention = OCRAttention("ocr_attention")

        if Config.learning_rate_decay == "piecewise_decay":
            learning_rate = fluid.layers.piecewise_decay(
                [50000], [Config.LR, Config.LR * 0.01])
        else:
            learning_rate = Config.LR
        # BUG FIX: the computed schedule was previously discarded —
        # Adam was constructed with a hardcoded learning_rate=0.001,
        # silently ignoring Config.LR / piecewise decay.
        optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
        # Clip global gradient norm at 5.0 on every optimizer step.
        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)

        train_reader = data_reader.train(
            Config.batch_size,
            max_length=Config.max_length,
            train_images_dir=args.train_images,
            train_list_file=args.train_list,
            cycle=args.total_step > 0,
            shuffle=True,
            model=args.model)
        infer_image = './data/data/test_images/'
        infer_files = './data/data/test.list'
        # Test reader reuses the train reader with a large max_length (1000)
        # and no cycling so one pass covers the whole test list.
        test_reader = data_reader.train(
            Config.batch_size,
            1000,
            train_images_dir=infer_image,
            train_list_file=infer_files,
            cycle=False,
            model=args.model)

        def eval():
            """Greedy-decode the test set and print exact-match accuracy."""
            ocr_attention.eval()
            total_step = 0.0
            equal_size = 0
            for data in test_reader():
                data_dict = get_attention_feeder_data(data)
                label_in = to_variable(data_dict["label_in"])
                label_out = to_variable(data_dict["label_out"])
                label_out._stop_gradient = True
                label_out.trainable = False
                img = to_variable(data_dict["pixel"])
                prediction = ocr_attention(img, label_in)
                # Flatten (batch, seq) x vocab for per-position top-1 decode.
                prediction = fluid.layers.reshape(
                    prediction,
                    [label_out.shape[0] * label_out.shape[1], -1],
                    inplace=False)
                score, topk = layers.topk(prediction, 1)
                seq = topk.numpy()
                seq = seq.reshape((args.batch_size, -1))
                mask = data_dict['mask'].reshape((args.batch_size, -1))
                seq_len = np.sum(mask, -1)
                trans_ref = data_dict["label_out"].reshape((args.batch_size, -1))
                for i in range(args.batch_size):
                    # Compare prediction vs. reference over the unpadded span.
                    length = int(seq_len[i] - 1)
                    trans = seq[i][:length - 1]
                    ref = trans_ref[i][:length - 1]
                    if np.array_equal(trans, ref):
                        equal_size += 1
                total_step += args.batch_size
            print("eval cost", equal_size / total_step)

        total_step = 0
        epoch_num = 20
        for epoch in range(epoch_num):
            batch_id = 0
            total_loss = 0.0
            for data in train_reader():
                total_step += 1
                data_dict = get_attention_feeder_data(data)
                label_in = to_variable(data_dict["label_in"])
                label_out = to_variable(data_dict["label_out"])
                label_out._stop_gradient = True
                label_out.trainable = False
                img = to_variable(data_dict["pixel"])
                prediction = ocr_attention(img, label_in)
                prediction = fluid.layers.reshape(
                    prediction,
                    [label_out.shape[0] * label_out.shape[1], -1],
                    inplace=False)
                label_out = fluid.layers.reshape(
                    label_out, [-1, 1], inplace=False)
                loss = fluid.layers.cross_entropy(
                    input=prediction, label=label_out)
                # Zero out loss at padded positions before summing.
                mask = to_variable(data_dict["mask"])
                loss = layers.elementwise_mul(loss, mask, axis=0)
                avg_loss = fluid.layers.reduce_sum(loss)
                total_loss += avg_loss.numpy()
                avg_loss.backward()
                optimizer.minimize(avg_loss, grad_clip=grad_clip)
                ocr_attention.clear_gradients()
                # Release traced ops to bound dygraph memory growth.
                framework._dygraph_tracer()._clear_ops()
                if batch_id > 0 and batch_id % 1000 == 0:
                    print("epoch: {}, batch_id: {}, loss {}".format(
                        epoch, batch_id,
                        total_loss / args.batch_size / 1000))
                    total_loss = 0.0
                # Checkpoint + evaluate every 2000 global steps.
                if total_step > 0 and total_step % 2000 == 0:
                    model_value = ocr_attention.state_dict()
                    np.savez("model/" + str(total_step), **model_value)
                    ocr_attention.eval()
                    eval()
                    ocr_attention.train()
                batch_id += 1