예제 #1
0
파일: train.py 프로젝트: zhong110020/models
def train(args):
    """Train an OCR model and print per-pass benchmark statistics.

    The network is built by ``ctc_train_net`` when ``args.model`` equals
    ``"crnn_ctc"`` and by ``attention_train_net`` otherwise. Training then
    loops over ``train_reader`` with periodic logging, evaluation and
    checkpointing.

    Args:
        args: Parsed command-line options. The attributes read here are
            ``model``, ``batch_size``, ``train_images``, ``train_list``,
            ``test_images``, ``test_list``, ``total_step``,
            ``skip_batch_num``, ``use_gpu``, ``parallel``, ``init_model``,
            ``log_period``, ``eval_period``, ``skip_test``,
            ``save_model_period`` and ``save_model_dir``.

    Note:
        ``stop`` is only ever set when ``args.total_step > 0``; with a
        non-positive ``total_step`` the outer loop keeps starting new passes.
    """
    # Pick the network builder together with its matching feed builder.
    if args.model == "crnn_ctc":
        train_net = ctc_train_net
        get_feeder_data = get_ctc_feeder_data
    else:
        train_net = attention_train_net
        get_feeder_data = get_attention_feeder_data

    num_classes = data_reader.num_classes()
    data_shape = data_reader.data_shape()
    # define network
    sum_cost, error_evaluator, inference_program, model_average = train_net(
        args, data_shape, num_classes)

    # data reader
    train_reader = data_reader.train(args.batch_size,
                                     train_images_dir=args.train_images,
                                     train_list_file=args.train_list,
                                     cycle=args.total_step > 0,
                                     model=args.model)
    test_reader = data_reader.test(test_images_dir=args.test_images,
                                   test_list_file=args.test_list,
                                   model=args.model)

    # prepare environment
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    # A fixed seed is set only when the 'ce_mode' environment variable exists.
    if 'ce_mode' in os.environ:
        fluid.default_startup_program().random_seed = 90

    exe.run(fluid.default_startup_program())

    # load init model
    if args.init_model is not None:
        model_dir = args.init_model
        fluid.load(fluid.default_main_program(),
                   model_dir,
                   var_list=fluid.io.get_program_parameter(
                       fluid.default_main_program()))
        print("Init model from: %s." % args.init_model)

    train_exe = exe
    error_evaluator.reset(exe)
    if args.parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=bool(args.use_gpu), loss_name=sum_cost.name)

    # results[0] is sum_cost; results[1:] follow error_evaluator.metrics.
    fetch_vars = [sum_cost] + error_evaluator.metrics

    def train_one_batch(data):
        """Run one training step and return one value per fetched variable."""
        if args.parallel:
            # The parallel executor is given variable names; every fetched
            # result is reduced to a single number with a sum.
            var_names = [var.name for var in fetch_vars]
            results = train_exe.run(var_names,
                                    feed=get_feeder_data(data, place))
            results = [np.array(result).sum() for result in results]
        else:
            results = train_exe.run(feed=get_feeder_data(data, place),
                                    fetch_list=fetch_vars)
            results = [result[0] for result in results]
        return results

    def test(iter_num):
        """Run the inference program over the test set and print the error."""
        error_evaluator.reset(exe)
        for data in test_reader():
            exe.run(inference_program, feed=get_feeder_data(data, place))
        _, test_seq_error = error_evaluator.eval(exe)
        print("\n[%s] - Iter[%d]; Test seq error: %s.\n" % (time.asctime(
            time.localtime(time.time())), iter_num, str(test_seq_error[0])))

        # Note: The following logs are special for CE monitoring.
        # Other situations do not need to care about these logs.
        if 'ce_mode' in os.environ:
            print("kpis\ttest_acc\t%f" % (1 - test_seq_error[0]))

    def save_model(args, exe, iter_num):
        """Save the main program under ``args.save_model_dir``."""
        filename = "model_%05d" % iter_num
        fluid.save(fluid.default_main_program(),
                   os.path.join(args.save_model_dir, filename))
        print("Saved model to: %s/%s." % (args.save_model_dir, filename))

    iter_num = 0
    stop = False
    start_time = time.time()
    while not stop:
        total_loss = 0.0
        total_seq_error = 0.0
        batch_times = []
        # train a pass
        for data in train_reader():
            if args.total_step > 0 and iter_num == args.total_step + args.skip_batch_num:
                stop = True
                break
            if iter_num < args.skip_batch_num:
                print("Warm-up iteration")
            if iter_num == args.skip_batch_num:
                profiler.reset_profiler()
            start = time.time()
            results = train_one_batch(data)
            batch_time = time.time() - start
            batch_times.append(batch_time)
            total_loss += results[0]
            # Index 2 is the second evaluator metric, reported as seq error.
            total_seq_error += results[2]

            iter_num += 1
            # training log
            if iter_num % args.log_period == 0:
                samples_in_window = args.log_period * args.batch_size
                avg_loss = total_loss / samples_in_window
                avg_seq_error = total_seq_error / samples_in_window
                print("\n[%s] - Iter[%d]; Avg loss: %.3f; Avg seq err: %.3f" %
                      (time.asctime(time.localtime(time.time())), iter_num,
                       avg_loss, avg_seq_error))
                if 'ce_mode' in os.environ:
                    print("kpis\ttrain_cost\t%f" % avg_loss)
                    print("kpis\ttrain_acc\t%f" % (1 - avg_seq_error))
                total_loss = 0.0
                total_seq_error = 0.0

            # evaluate
            if not args.skip_test and iter_num % args.eval_period == 0:
                if model_average:
                    with model_average.apply(exe):
                        test(iter_num)
                else:
                    test(iter_num)

            # save model
            if iter_num % args.save_model_period == 0:
                if model_average:
                    with model_average.apply(exe):
                        save_model(args, exe, iter_num)
                else:
                    save_model(args, exe, iter_num)
        end_time = time.time()
        if 'ce_mode' in os.environ:
            print("kpis\ttrain_duration\t%f" % (end_time - start_time))

        # Benchmark output
        print('\nTotal examples (incl. warm-up): %d' %
              (iter_num * args.batch_size))

        # Postprocess benchmark data. batch_times is rebuilt for every pass,
        # so the slice may be empty; statistics are skipped in that case
        # instead of being computed on an empty sequence.
        latencies = batch_times[args.skip_batch_num:]
        if latencies:
            latency_avg = np.average(latencies)
            latency_pc99 = np.percentile(latencies, 99)
            fpses = np.divide(args.batch_size, latencies)
            fps_avg = np.average(fpses)
            # The slowest 1% of throughput pairs with the 99th pct latency.
            fps_pc99 = np.percentile(fpses, 1)
            print('average latency: %.5f s, 99pc latency: %.5f s' %
                  (latency_avg, latency_pc99))
            print('average fps: %.5f, fps for 99pc latency: %.5f' %
                  (fps_avg, fps_pc99))
        else:
            print('No timed batches after warm-up; '
                  'latency/fps statistics skipped.')
예제 #2
0
파일: train.py 프로젝트: zhiqiu/benchmark
def train(args):
    """Train the attention OCR model in dygraph mode.

    Runs 20 epochs over ``train_reader``. Every 1000 batches of an epoch the
    accumulated loss is printed, and every 2000 global steps the model state
    is written to ``model/<step>.npz`` and an exact-match evaluation is run
    on the hard-coded test list.

    Args:
        args: Parsed command-line options. The attributes read here are
            ``train_images``, ``train_list``, ``total_step``, ``model`` and
            ``batch_size``.
    """
    # Local import: only needed to create the checkpoint directory.
    import os

    with fluid.dygraph.guard():
        # NOTE(review): backward_strategy is configured here but is never
        # passed to avg_loss.backward() below - confirm whether it should be.
        backward_strategy = fluid.dygraph.BackwardStrategy()
        backward_strategy.sort_sum_gradient = True
        ocr_attention = OCRAttention("ocr_attention")

        if Config.learning_rate_decay == "piecewise_decay":
            learning_rate = fluid.layers.piecewise_decay(
                [50000], [Config.LR, Config.LR * 0.01])
        else:
            learning_rate = Config.LR
        # NOTE(review): the optimizer uses a hard-coded 0.001 and ignores the
        # `learning_rate` computed above. Left unchanged because switching
        # would alter training behaviour - confirm the intended value.
        optimizer = fluid.optimizer.Adam(learning_rate=0.001)

        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)

        train_reader = data_reader.train(
            Config.batch_size,
            max_length=Config.max_length,
            train_images_dir=args.train_images,
            train_list_file=args.train_list,
            cycle=args.total_step > 0,
            shuffle=True,
            model=args.model)

        # The evaluation data location is fixed rather than taken from args.
        infer_image = './data/data/test_images/'
        infer_files = './data/data/test.list'
        test_reader = data_reader.train(
            Config.batch_size,
            1000,
            train_images_dir=infer_image,
            train_list_file=infer_files,
            cycle=False,
            model=args.model)

        def evaluate():
            """Print the fraction of test samples predicted exactly right."""
            ocr_attention.eval()
            total_samples = 0.0
            equal_size = 0
            for data in test_reader():
                data_dict = get_attention_feeder_data(data)

                label_in = to_variable(data_dict["label_in"])
                label_out = to_variable(data_dict["label_out"])

                label_out._stop_gradient = True
                label_out.trainable = False

                img = to_variable(data_dict["pixel"])

                prediction = ocr_attention(img, label_in)
                prediction = fluid.layers.reshape(
                    prediction,
                    [label_out.shape[0] * label_out.shape[1], -1],
                    inplace=False)

                _, topk = layers.topk(prediction, 1)

                # NOTE(review): the reader was built with Config.batch_size
                # but rows are split with args.batch_size - presumably the
                # two are equal; verify against the caller.
                seq = topk.numpy().reshape((args.batch_size, -1))

                mask = data_dict['mask'].reshape((args.batch_size, -1))
                seq_len = np.sum(mask, -1)

                trans_ref = data_dict["label_out"].reshape(
                    (args.batch_size, -1))
                for i in range(args.batch_size):
                    length = int(seq_len[i] - 1)
                    trans = seq[i][:length - 1]
                    ref = trans_ref[i][:length - 1]
                    if np.array_equal(trans, ref):
                        equal_size += 1

                total_samples += args.batch_size

            # The printed value is the exact-match ratio, despite the label.
            if total_samples:
                print("eval cost", equal_size / total_samples)
            else:
                # An empty test reader used to raise ZeroDivisionError here.
                print("eval cost: test reader produced no batches")

        total_step = 0
        epoch_num = 20
        for epoch in range(epoch_num):
            batch_id = 0

            total_loss = 0.0
            for data in train_reader():

                total_step += 1
                data_dict = get_attention_feeder_data(data)

                label_in = to_variable(data_dict["label_in"])
                label_out = to_variable(data_dict["label_out"])

                label_out._stop_gradient = True
                label_out.trainable = False

                img = to_variable(data_dict["pixel"])

                prediction = ocr_attention(img, label_in)
                prediction = fluid.layers.reshape(
                    prediction,
                    [label_out.shape[0] * label_out.shape[1], -1],
                    inplace=False)
                label_out = fluid.layers.reshape(
                    label_out, [-1, 1], inplace=False)
                loss = fluid.layers.cross_entropy(
                    input=prediction, label=label_out)

                mask = to_variable(data_dict["mask"])

                # The per-row loss is multiplied by the mask, then summed.
                loss = layers.elementwise_mul(loss, mask, axis=0)
                avg_loss = fluid.layers.reduce_sum(loss)

                total_loss += avg_loss.numpy()
                avg_loss.backward()
                optimizer.minimize(avg_loss, grad_clip=grad_clip)
                ocr_attention.clear_gradients()

                framework._dygraph_tracer()._clear_ops()

                if batch_id > 0 and batch_id % 1000 == 0:
                    print("epoch: {}, batch_id: {}, loss {}".format(
                        epoch, batch_id, total_loss / args.batch_size / 1000))

                    total_loss = 0.0

                # total_step is at least 1 here, so no extra > 0 check.
                if total_step % 2000 == 0:

                    model_value = ocr_attention.state_dict()
                    # np.savez does not create missing parent directories.
                    if not os.path.isdir("model"):
                        os.makedirs("model")
                    np.savez("model/" + str(total_step), **model_value)

                    # evaluate() puts the model in eval mode itself.
                    evaluate()
                    ocr_attention.train()

                batch_id += 1