Python InputField.build示例

编程语言: Python

命名空间/包名称: palm.toolkit.input_field

类/类型: InputField

方法/功能: build

hotexamples.com的示例: 6

Python InputField.build - 共找到6个示例。以下是从开源项目中提取的、评价最高的 palm.toolkit.input_field.InputField.build 真实 Python 示例。您可以对示例进行评分，以帮助我们提高示例质量。

常用方法

显示隐藏

InputField(6)

build(6)

示例#1

显示文件

def do_save_inference_model(args):
    """Build the prediction network and export it as an inference model.

    The network is built inside the default main/startup programs,
    parameters are loaded through ``init_from_params`` or
    ``init_from_pretrain_model``, and the result is written with
    ``fluid.io.save_inference_model``.

    Args:
        args: Configuration object. Attributes read directly here:
            ``random_seed``, ``max_seq_len``, ``use_cuda``,
            ``init_from_params``, ``init_from_pretrain_model`` and
            ``inference_model_dir``. ``args`` is also forwarded to
            ``create_net`` and to the ``init_from_*`` helpers.

    Raises:
        AssertionError: If neither ``args.init_from_params`` nor
            ``args.init_from_pretrain_model`` is truthy.
    """
    test_prog = fluid.default_main_program()
    startup_prog = fluid.default_startup_program()

    with fluid.program_guard(test_prog, startup_prog):
        test_prog.random_seed = args.random_seed
        startup_prog.random_seed = args.random_seed

        with fluid.unique_name.guard():

            # define inputs of the network as (name, shape, dtype) triples;
            # the same table later provides the feed names, so the two
            # cannot get out of sync
            token_shape = (-1, args.max_seq_len, 1)
            slot_specs = [
                ("src_ids", token_shape, "int64"),
                ("pos_ids", token_shape, "int64"),
                ("sent_ids", token_shape, "int64"),
                ("input_mask", token_shape, "float32"),
                ("input_span_mask", (-1, args.max_seq_len), "float32"),
                ("unique_id", (-1, 1), "int64"),
            ]
            input_slots = [{
                "name": name,
                "shape": shape,
                "dtype": dtype
            } for name, shape, dtype in slot_specs]

            input_field = InputField(input_slots)
            input_field.build(build_pyreader=True)

            # define the network

            predictions = create_net(is_training=False,
                                     model_input=input_field,
                                     args=args)

            # declare the outputs that become the targets of the saved model
            (unique_ids, top_k_start_log_probs, top_k_start_indexes,
             top_k_end_log_probs, top_k_end_indexes) = predictions

    # prepare predicting

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    exe = fluid.Executor(place)

    exe.run(startup_prog)

    # at least one parameter source is required; init_from_params wins
    # when both are given
    assert (args.init_from_params) or (args.init_from_pretrain_model)

    if args.init_from_params:
        init_from_params(args, exe, test_prog)

    elif args.init_from_pretrain_model:
        init_from_pretrain_model(args, exe, test_prog)

    # saving inference model

    fluid.io.save_inference_model(
        args.inference_model_dir,
        feeded_var_names=[
            getattr(input_field, name).name for name, _, _ in slot_specs
        ],
        target_vars=[
            unique_ids, top_k_start_log_probs, top_k_start_indexes,
            top_k_end_log_probs, top_k_end_indexes
        ],
        executor=exe,
        main_program=test_prog,
        model_filename="model.pdmodel",
        params_filename="params.pdparams")

    print("save inference model at %s" % (args.inference_model_dir))

示例#2

显示文件

def do_predict(args):
    """Decode ``args.predict_file`` and write the hypotheses to a file.

    Builds the inference network in the default programs, loads
    parameters, then runs the reader-driven prediction loop. For every
    source sentence, up to ``args.n_best`` hypotheses are written to
    ``args.output_file`` (binary mode), one per line.

    Args:
        args: Configuration object. ``args.src_vocab_size``,
            ``args.trg_vocab_size``, ``args.bos_idx``, ``args.eos_idx``
            and ``args.unk_idx`` are overwritten here with the values
            returned by ``processor.get_vocab_summary()``.

    Raises:
        AssertionError: If neither ``args.init_from_params`` nor
            ``args.init_from_pretrain_model`` is truthy.
    """
    if args.use_cuda:
        dev_count = fluid.core.get_cuda_device_count()
        place = fluid.CUDAPlace(0)
    else:
        dev_count = int(os.environ.get('CPU_NUM', 1))
        place = fluid.CPUPlace()
    # define the data generator
    processor = reader.DataProcessor(fpattern=args.predict_file,
                                     src_vocab_fpath=args.src_vocab_fpath,
                                     trg_vocab_fpath=args.trg_vocab_fpath,
                                     token_delimiter=args.token_delimiter,
                                     use_token_batch=False,
                                     batch_size=args.batch_size,
                                     device_count=dev_count,
                                     pool_size=args.pool_size,
                                     sort_type=reader.SortType.NONE,
                                     shuffle=False,
                                     shuffle_batch=False,
                                     start_mark=args.special_token[0],
                                     end_mark=args.special_token[1],
                                     unk_mark=args.special_token[2],
                                     max_length=args.max_length,
                                     n_head=args.n_head)
    batch_generator = processor.data_generator(phase="predict", place=place)
    args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \
        args.unk_idx = processor.get_vocab_summary()
    trg_idx2word = reader.DataProcessor.load_dict(
        dict_path=args.trg_vocab_fpath, reverse=True)

    test_prog = fluid.default_main_program()
    startup_prog = fluid.default_startup_program()

    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():

            # define input and reader

            input_field_names = desc.encoder_data_input_fields + desc.fast_decoder_data_input_fields
            input_slots = [{
                "name": name,
                "shape": desc.input_descs[name][0],
                "dtype": desc.input_descs[name][1]
            } for name in input_field_names]

            input_field = InputField(input_slots)
            input_field.build(build_pyreader=True)

            # define the network

            out_ids, out_scores, weight_matrix = create_net(
                is_training=False, model_input=input_field, args=args)

            out_ids.persistable = out_scores.persistable = weight_matrix.persistable = True
    # This is used here to set dropout to the test mode.
    test_prog = test_prog.clone(for_test=True)

    # prepare predicting

    ## define the executor and program for training

    exe = fluid.Executor(place)

    exe.run(startup_prog)
    assert (args.init_from_params) or (args.init_from_pretrain_model)

    if args.init_from_params:
        init_from_params(args, exe, test_prog)

    elif args.init_from_pretrain_model:
        init_from_pretrain_model(args, exe, test_prog)

    # to avoid a longer length than training, reset the size of position encoding to max_length
    for pos_enc_param_name in desc.pos_enc_param_names:
        pos_enc_param = fluid.global_scope().find_var(
            pos_enc_param_name).get_tensor()

        pos_enc_param.set(
            position_encoding_init(args.max_length + 1, args.d_model), place)

    exe_strategy = fluid.ExecutionStrategy()
    # to clear tensor array after each iteration
    exe_strategy.num_iteration_per_drop_scope = 1
    # NOTE(review): the compiled program is built but the loop below runs
    # the plain ``test_prog``; kept as in the original -- confirm intent.
    compiled_test_prog = fluid.CompiledProgram(test_prog).with_data_parallel(
        exec_strategy=exe_strategy, places=place)

    # The context manager closes the output file even when prediction
    # fails part-way through.
    with open(args.output_file, "wb") as f:
        # start predicting
        ## decorate the pyreader with batch_generator
        input_field.reader.decorate_batch_generator(batch_generator)
        input_field.reader.start()
        while True:
            try:
                seq_ids, seq_scores, out_weight = exe.run(
                    test_prog,
                    fetch_list=[out_ids.name, out_scores.name, weight_matrix],
                    return_numpy=False)

                # How to parse the results:
                #   Suppose the lod of seq_ids is:
                #     [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
                #   then from lod[0]:
                #     there are 2 source sentences, beam width is 3.
                #   from lod[1]:
                #     the first source sentence has 3 hyps; the lengths are 12, 12, 16
                #     the second source sentence has 3 hyps; the lengths are 14, 13, 15

                # Convert once per batch instead of once per candidate.
                ids_lod = seq_ids.lod()
                ids_array = np.array(seq_ids)
                scores_array = np.array(seq_scores)
                num_sources = len(ids_lod[0]) - 1

                hyps = [[] for _ in range(num_sources)]
                scores = [[] for _ in range(len(seq_scores.lod()[0]) - 1)]
                for i in range(num_sources):  # for each source sentence
                    start = ids_lod[0][i]
                    end = ids_lod[0][i + 1]
                    for j in range(end - start):  # for each candidate
                        sub_start = ids_lod[1][start + j]
                        sub_end = ids_lod[1][start + j + 1]
                        hyps[i].append(b" ".join([
                            trg_idx2word[idx] for idx in post_process_seq(
                                ids_array[sub_start:sub_end], args.bos_idx,
                                args.eos_idx)
                        ]))
                        scores[i].append(scores_array[sub_end - 1])
                        f.write(hyps[i][-1] + b"\n")
                        # stop once n_best hypotheses were written
                        if len(hyps[i]) >= args.n_best:
                            break
            except fluid.core.EOFException:
                # the reader is exhausted
                break

示例#3

显示文件

文件： train.py 项目： smallZh/PaddleNLP

def do_train(args):
    """Build the training network and run the training loop.

    Creates the data generator, builds the network and an Adam optimizer
    with a noam-decay learning rate inside the default programs, optionally
    restores a checkpoint or pretrained parameters, then trains for
    ``args.epoch`` passes, logging every ``args.print_step`` steps and
    saving every ``args.save_step`` steps.

    Args:
        args: Configuration object. ``args.src_vocab_size``,
            ``args.trg_vocab_size``, ``args.bos_idx``, ``args.eos_idx``
            and ``args.unk_idx`` are overwritten here with the values
            returned by ``processor.get_vocab_summary()``.

    Raises:
        AssertionError: If both ``args.init_from_checkpoint`` and
            ``args.init_from_pretrain_model`` are non-empty strings.
    """
    # NOTE(review): ``num_trainers`` is not defined in this function;
    # presumably a module-level global -- confirm.
    if args.use_cuda:
        if num_trainers > 1:  # for multi-process gpu training
            dev_count = 1
        else:
            dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
        place = fluid.CUDAPlace(gpu_id)
    else:
        dev_count = int(os.environ.get('CPU_NUM', 1))
        place = fluid.CPUPlace()

    # define the data generator
    processor = reader.DataProcessor(fpattern=args.training_file,
                                     src_vocab_fpath=args.src_vocab_fpath,
                                     trg_vocab_fpath=args.trg_vocab_fpath,
                                     token_delimiter=args.token_delimiter,
                                     use_token_batch=args.use_token_batch,
                                     batch_size=args.batch_size,
                                     device_count=dev_count,
                                     pool_size=args.pool_size,
                                     sort_type=args.sort_type,
                                     shuffle=args.shuffle,
                                     shuffle_batch=args.shuffle_batch,
                                     start_mark=args.special_token[0],
                                     end_mark=args.special_token[1],
                                     unk_mark=args.special_token[2],
                                     max_length=args.max_length,
                                     n_head=args.n_head)
    batch_generator = processor.data_generator(phase="train")
    if num_trainers > 1:  # for multi-process gpu training
        batch_generator = fluid.contrib.reader.distributed_batch_reader(
            batch_generator)
    args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \
        args.unk_idx = processor.get_vocab_summary()

    train_prog = fluid.default_main_program()
    startup_prog = fluid.default_startup_program()
    # NOTE(review): eval() executes whatever text the config value holds;
    # if ``args.random_seed`` can come from untrusted input this is unsafe.
    # It presumably exists to turn the string "None" into None -- confirm.
    random_seed = eval(str(args.random_seed))
    if random_seed is not None:
        train_prog.random_seed = random_seed
        startup_prog.random_seed = random_seed

    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():

            # define input and reader

            # the last decoder input field is left out of the slot list
            input_field_names = desc.encoder_data_input_fields + \
                    desc.decoder_data_input_fields[:-1] + desc.label_data_input_fields
            input_slots = [{
                "name": name,
                "shape": desc.input_descs[name][0],
                "dtype": desc.input_descs[name][1]
            } for name in input_field_names]

            input_field = InputField(input_slots)
            input_field.build(build_pyreader=True)

            # define the network

            sum_cost, avg_cost, token_num = create_net(is_training=True,
                                                       model_input=input_field,
                                                       args=args)

            sum_cost.persistable = avg_cost.persistable = token_num.persistable = True

            # define the optimizer

            with fluid.default_main_program()._lr_schedule_guard():
                learning_rate = fluid.layers.learning_rate_scheduler.noam_decay(
                    args.d_model, args.warmup_steps) * args.learning_rate

            optimizer = fluid.optimizer.Adam(learning_rate=learning_rate,
                                             beta1=args.beta1,
                                             beta2=args.beta2,
                                             epsilon=float(args.eps))
            optimizer.minimize(avg_cost)

    # prepare training

    ## decorate the pyreader with batch_generator
    input_field.reader.decorate_batch_generator(batch_generator)

    ## define the executor and program for training

    exe = fluid.Executor(place)

    exe.run(startup_prog)
    # init position_encoding
    for pos_enc_param_name in desc.pos_enc_param_names:
        pos_enc_param = fluid.global_scope().find_var(
            pos_enc_param_name).get_tensor()

        pos_enc_param.set(
            position_encoding_init(args.max_length + 1, args.d_model), place)

    # at most one of the two initialization sources may be given
    assert (args.init_from_checkpoint == "") or (args.init_from_pretrain_model
                                                 == "")

    ## init from some checkpoint, to resume the previous training
    if args.init_from_checkpoint:
        init_from_checkpoint(args, exe, train_prog)

    ## init from some pretrain models, to better solve the current task
    if args.init_from_pretrain_model:
        init_from_pretrain_model(args, exe, train_prog)

    build_strategy = fluid.compiler.BuildStrategy()
    build_strategy.enable_inplace = True
    exec_strategy = fluid.ExecutionStrategy()
    if num_trainers > 1:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_cost.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    # the best cross-entropy value with label smoothing
    loss_normalizer = -(
        (1. - args.label_smooth_eps) * np.log((1. - args.label_smooth_eps)) +
        args.label_smooth_eps * np.log(args.label_smooth_eps /
                                       (args.trg_vocab_size - 1) + 1e-20))
    # start training

    # step_idx counts steps across all passes; batch_id restarts each pass
    step_idx = 0
    for pass_id in range(args.epoch):
        pass_start_time = time.time()
        input_field.reader.start()

        batch_id = 0
        while True:
            try:
                # fetch the costs only on logging steps
                outs = exe.run(
                    compiled_train_prog,
                    fetch_list=[sum_cost.name, token_num.name] if step_idx %
                    args.print_step == 0 else [])

                if step_idx % args.print_step == 0:
                    sum_cost_val, token_num_val = np.array(outs[0]), np.array(
                        outs[1])
                    # sum the cost from multi-devices
                    total_sum_cost = sum_cost_val.sum()
                    total_token_num = token_num_val.sum()
                    total_avg_cost = total_sum_cost / total_token_num

                    # the very first step has no previous timestamp, so no
                    # speed is reported; the loss is capped at 100 before
                    # np.exp when computing ppl
                    if step_idx == 0:
                        logging.info(
                            "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                            "normalized loss: %f, ppl: %f" %
                            (step_idx, pass_id, batch_id, total_avg_cost,
                             total_avg_cost - loss_normalizer,
                             np.exp([min(total_avg_cost, 100)])))
                        avg_batch_time = time.time()
                    else:
                        logging.info(
                            "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                            "normalized loss: %f, ppl: %f, speed: %.2f step/s"
                            % (step_idx, pass_id, batch_id, total_avg_cost,
                               total_avg_cost - loss_normalizer,
                               np.exp([min(total_avg_cost, 100)
                                       ]), args.print_step /
                               (time.time() - avg_batch_time)))
                        avg_batch_time = time.time()

                if step_idx % args.save_step == 0 and step_idx != 0:

                    if args.save_checkpoint:
                        save_checkpoint(args, exe, train_prog,
                                        "step_" + str(step_idx))

                    if args.save_param:
                        save_param(args, exe, train_prog,
                                   "step_" + str(step_idx))

                batch_id += 1
                step_idx += 1

            except fluid.core.EOFException:
                # end of the pass: reset the reader so it can be restarted
                input_field.reader.reset()
                break

        time_consumed = time.time() - pass_start_time

    if args.save_checkpoint:
        save_checkpoint(args, exe, train_prog, "step_final")

    if args.save_param:
        save_param(args, exe, train_prog, "step_final")

    # NOTE(review): ``total_avg_cost`` is only assigned on logging steps and
    # ``time_consumed`` only inside the epoch loop; both are unbound here if
    # no step ran (e.g. ``args.epoch == 0``).
    if args.enable_ce:  # For CE
        print("kpis\ttrain_cost_card%d\t%f" % (dev_count, total_avg_cost))
        print("kpis\ttrain_duration_card%d\t%f" % (dev_count, time_consumed))

示例#4

显示文件

文件： predict.py 项目： ZeyuChen/models_new

def do_predict(args):
    """Run prediction over ``args.predict_file`` and write the answers.

    The network is built in the default programs, parameters are loaded
    through ``init_from_params`` or ``init_from_pretrain_model``, every
    batch produced by the reader is converted into ``RawResult`` records,
    and the collected records are handed to ``write_predictions``.

    Args:
        args: Configuration object; also forwarded to ``create_net`` and
            the ``init_from_*`` helpers.

    Raises:
        AssertionError: If neither ``args.init_from_params`` nor
            ``args.init_from_pretrain_model`` is truthy.
    """
    test_prog = fluid.default_main_program()
    startup_prog = fluid.default_startup_program()

    with fluid.program_guard(test_prog, startup_prog):
        test_prog.random_seed = args.random_seed
        startup_prog.random_seed = args.random_seed

        with fluid.unique_name.guard():

            # network inputs, one (name, shape, dtype) row per slot
            slot_table = (
                ("src_ids", (-1, args.max_seq_len, 1), "int64"),
                ("pos_ids", (-1, args.max_seq_len, 1), "int64"),
                ("sent_ids", (-1, args.max_seq_len, 1), "int64"),
                ("input_mask", (-1, args.max_seq_len, 1), "float32"),
                ("input_span_mask", (-1, args.max_seq_len), "float32"),
                ("unique_id", (-1, 1), "int64"),
            )
            input_slots = [
                dict(name=slot_name, shape=slot_shape, dtype=slot_dtype)
                for slot_name, slot_shape, slot_dtype in slot_table
            ]

            input_field = InputField(input_slots)
            input_field.build(build_pyreader=True)

            # build the network
            predictions = create_net(
                is_training=False, model_input=input_field, args=args)

            # the five outputs to be fetched
            (unique_ids, top_k_start_log_probs, top_k_start_indexes,
             top_k_end_log_probs, top_k_end_indexes) = predictions
            fetched_vars = (unique_ids, top_k_start_log_probs,
                            top_k_start_indexes, top_k_end_log_probs,
                            top_k_end_indexes)

            # make them persistable, will be removed in PaddlePaddle 1.6
            for fetched in fetched_vars:
                fetched.persistable = True

            # fetch by variable name
            fetch_list = [fetched.name for fetched in fetched_vars]

    # prepare predicting

    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    assert (args.init_from_params) or (args.init_from_pretrain_model)

    if args.init_from_params:
        init_from_params(args, exe, test_prog)
    elif args.init_from_pretrain_model:
        init_from_pretrain_model(args, exe, test_prog)

    compiled_test_prog = fluid.CompiledProgram(test_prog)

    # start predicting

    ## data processor and the batch generator feeding the reader
    processor = DataProcessor(
        vocab_path=args.vocab_path,
        do_lower_case=args.do_lower_case,
        max_seq_length=args.max_seq_len,
        in_tokens=args.in_tokens,
        doc_stride=args.doc_stride,
        do_stride=args.do_stride,
        max_query_length=args.max_query_len)

    batch_generator = processor.data_generator(
        data_path=args.predict_file,
        batch_size=args.batch_size,
        phase="predict",
        shuffle=False,
        dev_count=1,
        epoch=1)

    ## decorate the pyreader with batch_generator
    input_field.reader.decorate_batch_generator(batch_generator)

    RawResult = collections.namedtuple("RawResult", [
        "unique_id", "top_k_start_log_probs", "top_k_start_indexes",
        "top_k_end_log_probs", "top_k_end_indexes"
    ])
    all_results = []

    input_field.reader.start()
    while True:
        try:
            (batch_ids, batch_start_log_probs, batch_start_indexes,
             batch_end_log_probs, batch_end_indexes) = exe.run(
                 compiled_test_prog, fetch_list=fetch_list)

            for row in range(batch_ids.shape[0]):
                processed = len(all_results)
                if processed % 1000 == 0:
                    print("Processing example: %d" % processed)

                # flatten each per-example row into plain Python numbers
                all_results.append(
                    RawResult(
                        int(batch_ids[row]),
                        list(map(float, batch_start_log_probs[row].flat)),
                        list(map(int, batch_start_indexes[row].flat)),
                        list(map(float, batch_end_log_probs[row].flat)),
                        list(map(int, batch_end_indexes[row].flat))))

        except fluid.core.EOFException:
            # the reader has no more batches
            break

    features = processor.get_features(
        processor.predict_examples, is_training=False)

    write_predictions(processor.predict_examples, features, all_results,
                      args.n_best_size, args.max_answer_length,
                      args.do_lower_case, args.output_prediction_file,
                      args.output_nbest_file, None, args.start_top_k,
                      args.end_top_k, args.verbose)

示例#5

显示文件

文件： inference_model.py 项目： smallZh/PaddleNLP

def do_save_inference_model(args):
    """Export the decoding network as an inference model.

    Builds the network in the default programs from the input fields
    listed in ``desc``, clones the program for test mode, loads parameters
    and saves the model to ``args.inference_model_dir``.

    Args:
        args: Configuration object; also forwarded to ``create_net`` and
            the ``init_from_*`` helpers.

    Raises:
        AssertionError: If neither ``args.init_from_params`` nor
            ``args.init_from_pretrain_model`` is truthy.
    """
    if not args.use_cuda:
        dev_count = int(os.environ.get('CPU_NUM', 1))
        place = fluid.CPUPlace()
    else:
        dev_count = fluid.core.get_cuda_device_count()
        place = fluid.CUDAPlace(0)

    test_prog = fluid.default_main_program()
    startup_prog = fluid.default_startup_program()

    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():

            # one slot per encoder / fast-decoder input field
            input_field_names = (desc.encoder_data_input_fields +
                                 desc.fast_decoder_data_input_fields)
            input_slots = []
            for field_name in input_field_names:
                field_desc = desc.input_descs[field_name]
                input_slots.append({
                    "name": field_name,
                    "shape": field_desc[0],
                    "dtype": field_desc[1]
                })

            input_field = InputField(input_slots)
            input_field.build(build_pyreader=True)

            # build the network; it yields the ids and their scores
            out_ids, out_scores = create_net(is_training=False,
                                             model_input=input_field,
                                             args=args)

    # This is used here to set dropout to the test mode.
    test_prog = test_prog.clone(for_test=True)

    # executor used for both initialization and saving
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    assert args.init_from_params or args.init_from_pretrain_model

    if args.init_from_params:
        init_from_params(args, exe, test_prog)
    elif args.init_from_pretrain_model:
        init_from_pretrain_model(args, exe, test_prog)

    # saving inference model
    export_dir = args.inference_model_dir
    fluid.io.save_inference_model(export_dir,
                                  feeded_var_names=input_field_names,
                                  target_vars=[out_ids, out_scores],
                                  executor=exe,
                                  main_program=test_prog,
                                  model_filename="model.pdmodel",
                                  params_filename="params.pdparams")

    print("save inference model at %s" % (args.inference_model_dir))

示例#6

显示文件

def do_train(args):
    """Build the training network and run the training loop.

    Builds the network and optimizer inside the default programs,
    optionally restores a checkpoint or pretrained parameters, then trains
    for ``args.epoch`` reader passes, printing the loss every
    ``args.print_step`` steps and saving every ``args.save_step`` steps.

    Args:
        args: Configuration object; also forwarded to ``create_net`` and
            to the init/save helpers.

    Raises:
        AssertionError: If both ``args.init_from_checkpoint`` and
            ``args.init_from_pretrain_model`` are non-empty strings.
    """

    train_prog = fluid.default_main_program()
    startup_prog = fluid.default_startup_program()

    with fluid.program_guard(train_prog, startup_prog):
        train_prog.random_seed = args.random_seed
        startup_prog.random_seed = args.random_seed

        with fluid.unique_name.guard():

            # define input and reader

            input_slots = [{
                "name": "src_ids",
                "shape": (-1, args.max_seq_len, 1),
                "dtype": "int64"
            }, {
                "name": "pos_ids",
                "shape": (-1, args.max_seq_len, 1),
                "dtype": "int64"
            }, {
                "name": "sent_ids",
                "shape": (-1, args.max_seq_len, 1),
                "dtype": "int64"
            }, {
                "name": "input_mask",
                "shape": (-1, args.max_seq_len, 1),
                "dtype": "float32"
            }, {
                "name": "input_span_mask",
                "shape": (-1, args.max_seq_len),
                "dtype": "float32"
            }, {
                "name": "start_positions",
                "shape": (-1, 1),
                "dtype": "int64"
            }, {
                "name": "end_positions",
                "shape": (-1, 1),
                "dtype": "int64"
            }, {
                "name": "is_null_answer",
                "shape": (-1, 1),
                "dtype": "int64"
            }]

            input_field = InputField(input_slots)
            input_field.build(build_pyreader=True)

            # define the network

            loss = create_net(is_training=True,
                              model_input=input_field,
                              args=args)

            loss.persistable = True

            # define the optimizer

            if args.use_cuda:
                dev_count = fluid.core.get_cuda_device_count()
            else:
                dev_count = int(
                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

            # as we need to get the max training steps for warmup training,
            # we define the data processor in advance
            # usually, we can declare the data processor later, outside the program_guard scope

            processor = DataProcessor(vocab_path=args.vocab_path,
                                      do_lower_case=args.do_lower_case,
                                      max_seq_length=args.max_seq_len,
                                      in_tokens=args.in_tokens,
                                      doc_stride=args.doc_stride,
                                      do_stride=args.do_stride,
                                      max_query_length=args.max_query_len)

            ## define the data generator
            # NOTE(review): the generator is created with epoch=args.epoch
            # and the training loop below also iterates range(args.epoch);
            # confirm the data is not repeated more often than intended.
            batch_generator = processor.data_generator(
                data_path=args.training_file,
                batch_size=args.batch_size,
                phase="train",
                shuffle=True,
                dev_count=dev_count,
                epoch=args.epoch)

            # total step count, and the share of it used for warmup
            num_train_examples = processor.get_num_examples(phase='train')
            max_train_steps = args.epoch * num_train_examples // dev_count // args.batch_size
            warmup_steps = int(max_train_steps * args.warmup_proportion)

            print(max_train_steps, warmup_steps, num_train_examples)

            # NOTE(review): the return value is stored in ``optimizor`` but
            # never used afterwards in this function.
            optimizor = optimization(loss=loss,
                                     warmup_steps=warmup_steps,
                                     num_train_steps=max_train_steps,
                                     learning_rate=args.learning_rate,
                                     train_program=train_prog,
                                     startup_prog=startup_prog,
                                     weight_decay=args.weight_decay,
                                     scheduler=args.lr_scheduler,
                                     use_fp16=args.use_fp16,
                                     loss_scaling=args.loss_scaling)

    # prepare training

    ## decorate the pyreader with batch_generator
    input_field.reader.decorate_batch_generator(batch_generator)

    ## define the executor and program for training

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    exe = fluid.Executor(place)

    exe.run(startup_prog)

    # at most one of the two initialization sources may be given
    assert (args.init_from_checkpoint == "") or (args.init_from_pretrain_model
                                                 == "")

    ## init from some checkpoint, to resume the previous training
    if args.init_from_checkpoint:
        init_from_checkpoint(args, exe, train_prog)

    ## init from some pretrain models, to better solve the current task
    if args.init_from_pretrain_model:
        init_from_pretrain_model(args, exe, train_prog)

    build_strategy = fluid.compiler.BuildStrategy()
    build_strategy.enable_inplace = True

    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)

    # start training

    # ``step`` counts steps across all reader passes
    step = 0
    for epoch_step in range(args.epoch):
        input_field.reader.start()
        while True:
            try:

                # fetch the loss only on printing steps, to keep the
                # other steps free of the fetch op
                if step % args.print_step == 0:
                    fetch_list = [loss.name]
                else:
                    fetch_list = []

                output = exe.run(compiled_train_prog, fetch_list=fetch_list)

                if step % args.print_step == 0:
                    print("step: %d, loss: %.4f" % (step, np.sum(output[0])))

                if step % args.save_step == 0 and step != 0:

                    if args.save_checkpoint:
                        save_checkpoint(args, exe, train_prog,
                                        "step_" + str(step))

                    if args.save_param:
                        save_param(args, exe, train_prog, "step_" + str(step))

                step += 1

            except fluid.core.EOFException:
                # end of the pass: reset the reader so it can be restarted
                input_field.reader.reset()
                break

    if args.save_checkpoint:
        save_checkpoint(args, exe, train_prog, "step_final")

    if args.save_param:
        save_param(args, exe, train_prog, "step_final")