Example #1
def prepare_feed_dict_list(data_generator, init_flag, count):
    """
    Prepare the list of feed dict for multi-devices.
    """
    feed_dict_list = []
    if data_generator is not None:  # use_py_reader == False
        data_input_names = encoder_data_input_fields + \
            decoder_data_input_fields[:-1] + label_data_input_fields
        data = next(data_generator)
        for idx, data_buffer in enumerate(data):
            data_input_dict, num_token = prepare_batch_input(
                data_buffer, data_input_names, ModelHyperParams.eos_idx,
                ModelHyperParams.eos_idx, ModelHyperParams.n_head,
                ModelHyperParams.d_model)
            feed_dict_list.append(data_input_dict)
    if init_flag:
        for idx in range(count):
            pos_enc_tables = dict()
            for pos_enc_param_name in pos_enc_param_names:
                pos_enc_tables[pos_enc_param_name] = position_encoding_init(
                    ModelHyperParams.max_length + 1, ModelHyperParams.d_model)
            if len(feed_dict_list) <= idx:
                feed_dict_list.append(pos_enc_tables)
            else:
                feed_dict_list[idx] = dict(
                    list(pos_enc_tables.items()) +
                    list(feed_dict_list[idx].items()))

    return feed_dict_list if len(feed_dict_list) == count else None
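
The helper position_encoding_init used above is not shown in these examples. As a reference, here is a minimal sketch of the standard sinusoidal encoding from "Attention Is All You Need", assuming it returns a float32 array of shape [n_position, d_pos_vec] as the feed expects:

import numpy as np

def position_encoding_init(n_position, d_pos_vec):
    """Sinusoidal position-encoding table of shape [n_position, d_pos_vec]."""
    channels = np.arange(d_pos_vec)
    # each sin/cos pair of channels shares one frequency: 10000^(2i/d_model)
    inv_freq = 1.0 / np.power(10000.0, 2 * (channels // 2) / d_pos_vec)
    angles = np.outer(np.arange(n_position), inv_freq)
    table = np.zeros_like(angles)
    table[:, 0::2] = np.sin(angles[:, 0::2])  # even channels use sine
    table[:, 1::2] = np.cos(angles[:, 1::2])  # odd channels use cosine
    return table.astype("float32")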
Example #2
def train_loop(exe, train_progm, init, num_iters, train_data, dev_count,
               sum_cost, avg_cost, lr_scheduler, token_num, predict):

    data_input_names = encoder_data_input_fields + \
        decoder_data_input_fields[:-1] + label_data_input_fields
    util_input_names = encoder_util_input_fields + decoder_util_input_fields

    start_time = time.time()
    exec_time = 0.0
    for batch_id, data in enumerate(train_data()):
        if batch_id >= num_iters:
            break
        feed_list = []
        total_num_token = 0
        for place_id, data_buffer in enumerate(
                split_data(data, num_part=dev_count)):
            data_input_dict, util_input_dict, num_token = prepare_batch_input(
                data_buffer, data_input_names, util_input_names,
                ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                ModelHyperParams.n_head, ModelHyperParams.d_model)
            total_num_token += num_token
            feed_kv_pairs = list(data_input_dict.items()) + list(
                util_input_dict.items())
            lr_rate = lr_scheduler.update_learning_rate()
            feed_kv_pairs.append((lr_scheduler.learning_rate.name, lr_rate))
            feed_list.append(dict(feed_kv_pairs))

            if not init:
                for pos_enc_param_name in pos_enc_param_names:
                    pos_enc = position_encoding_init(
                        ModelHyperParams.max_length + 1,
                        ModelHyperParams.d_model)
                    feed_list[place_id][pos_enc_param_name] = pos_enc
        for feed_dict in feed_list:
            feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token

        exe_start_time = time.time()
        if dev_count > 1:
            # parallel executor
            outs = exe.run(fetch_list=[sum_cost.name, token_num.name],
                           feed=feed_list)
        else:
            # executor
            outs = exe.run(fetch_list=[sum_cost, token_num], feed=feed_list[0])
        exec_time += time.time() - exe_start_time

        sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
        total_sum_cost = sum_cost_val.sum()  # sum the cost from multi-devices
        total_token_num = token_num_val.sum()
        total_avg_cost = total_sum_cost / total_token_num
        print("batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
              (batch_id, total_sum_cost, total_avg_cost,
               np.exp([min(total_avg_cost, 100)])))
        init = True
    return time.time() - start_time, exec_time
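
The np.exp([min(total_avg_cost, 100)]) pattern above turns the per-token cross-entropy into perplexity, clipping the exponent at 100 so very large early losses cannot overflow the float. The same idea as a standalone helper (the name to_ppl is only illustrative):

import numpy as np

def to_ppl(avg_cost, clip=100.0):
    """Perplexity = exp(average per-token cross-entropy), overflow-clipped."""
    return float(np.exp(min(float(avg_cost), clip)))

# an average loss of ln(10) ~ 2.3026 nats corresponds to a perplexity of 10
assert abs(to_ppl(2.302585) - 10.0) < 1e-3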
Example #3
def main():
    args = parse_args()
    place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        TrainTaskConfig.label_smooth_eps)

    lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
                                         TrainTaskConfig.warmup_steps,
                                         TrainTaskConfig.learning_rate)
    optimizer = fluid.optimizer.Adam(learning_rate=lr_scheduler.learning_rate,
                                     beta1=TrainTaskConfig.beta1,
                                     beta2=TrainTaskConfig.beta2,
                                     epsilon=TrainTaskConfig.eps)
    optimizer.minimize(sum_cost)

    dev_count = fluid.core.get_cuda_device_count()

    train_data = paddle.batch(
        paddle.dataset.wmt16.train(ModelHyperParams.src_vocab_size,
                                   ModelHyperParams.trg_vocab_size),
        batch_size=TrainTaskConfig.batch_size)

    # Program to do validation.
    test_program = fluid.default_main_program().clone()
    with fluid.program_guard(test_program):
        test_program = fluid.io.get_inference_program([avg_cost])
    val_data = paddle.batch(
        paddle.dataset.wmt16.validation(ModelHyperParams.src_vocab_size,
                                        ModelHyperParams.trg_vocab_size),
        batch_size=TrainTaskConfig.batch_size)

    def test(exe):
        test_total_cost = 0
        test_total_token = 0
        test_data = read_multiple(reader=val_data, count=dev_count)
        for batch_id, data in enumerate(test_data()):
            feed_list = []
            for place_id, data_buffer in enumerate(data):
                data_input_dict, util_input_dict, _ = prepare_batch_input(
                    data_buffer, data_input_names, util_input_names,
                    ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head, ModelHyperParams.d_model)
                feed_list.append(
                    dict(list(data_input_dict.items()) +
                         list(util_input_dict.items())))

            outs = exe.run(feed=feed_list,
                           fetch_list=[sum_cost.name, token_num.name])
            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            test_total_cost += sum_cost_val.sum()
            test_total_token += token_num_val.sum()
        test_avg_cost = test_total_cost / test_total_token
        test_ppl = np.exp([min(test_avg_cost, 100)])
        return test_avg_cost, test_ppl

    # Initialize the parameters.
    if TrainTaskConfig.ckpt_path:
        fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
        lr_scheduler.current_steps = TrainTaskConfig.start_step
    else:
        exe.run(fluid.framework.default_startup_program())

    data_input_names = encoder_data_input_fields + \
        decoder_data_input_fields[:-1] + label_data_input_fields
    util_input_names = encoder_util_input_fields + decoder_util_input_fields

    train_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                       loss_name=sum_cost.name)

    test_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                      main_program=test_program,
                                      share_vars_from=train_exe)

    init = False
    train_data = read_multiple(reader=train_data, count=dev_count)

    for pass_id in range(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        for batch_id, data in enumerate(train_data()):
            feed_list = []
            total_num_token = 0
            lr_rate = lr_scheduler.update_learning_rate()
            for place_id, data_buffer in enumerate(data):
                data_input_dict, util_input_dict, num_token = prepare_batch_input(
                    data_buffer, data_input_names, util_input_names,
                    ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head, ModelHyperParams.d_model)
                total_num_token += num_token
                feed_list.append(
                    dict(list(data_input_dict.items()) +
                         list(util_input_dict.items()) +
                         [(lr_scheduler.learning_rate.name, lr_rate)]))

                if not init:
                    for pos_enc_param_name in pos_enc_param_names:
                        tensor = position_encoding_init(
                            ModelHyperParams.max_length + 1,
                            ModelHyperParams.d_model)
                        feed_list[place_id][pos_enc_param_name] = tensor
            for feed_dict in feed_list:
                feed_dict[sum_cost.name + "@GRAD"] = (
                    1. / total_num_token if TrainTaskConfig.use_avg_cost else
                    np.asarray([1.], dtype="float32"))
            outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name],
                                 feed=feed_list)
            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            total_sum_cost = sum_cost_val.sum()  # sum the cost from multi-devices
            total_token_num = token_num_val.sum()
            total_avg_cost = total_sum_cost / total_token_num
            print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
                  (pass_id, batch_id, total_sum_cost, total_avg_cost,
                   np.exp([min(total_avg_cost, 100)])))
            init = True
        pass_end_time = time.time()
        # Validate and save the model for inference.
        val_avg_cost, val_ppl = test(test_exe)
        time_consumed = pass_end_time - pass_start_time
        print("pass_id = " + str(pass_id) + " time_consumed = " +
              str(time_consumed))
        if pass_id == TrainTaskConfig.pass_num - 1:
            if args.gpu_card_num == 1:
                test_avg_ppl_kpi.add_record(np.array(val_ppl, dtype='float32'))
                train_pass_duration_kpi.add_record(time_consumed)
                test_avg_ppl_kpi.persist()
                train_pass_duration_kpi.persist()
            else:
                test_avg_ppl_kpi_card4.add_record(
                    np.array(val_ppl, dtype='float32'))
                train_pass_duration_kpi_card4.add_record(time_consumed)
                test_avg_ppl_kpi_card4.persist()
                train_pass_duration_kpi_card4.persist()
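
read_multiple, used by both loops in this example, groups `count` consecutive batches so that each device receives one; Example #7 below contains the full definition, including remainder handling. A trimmed sketch of the core behavior (dropping the clip_last logic):

def read_multiple(reader, count):
    """Stack `count` consecutive batches from `reader` into one list."""
    def __impl__():
        res = []
        for item in reader():
            res.append(item)
            if len(res) == count:
                yield res
                res = []
    return __impl__

# a dummy reader yielding four batches, grouped for two devices
batches = lambda: iter([[1], [2], [3], [4]])
assert list(read_multiple(batches, 2)()) == [[[1], [2]], [[3], [4]]]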
Example #4
def do_predict(args):
    if args.use_cuda:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    # define the data generator
    processor = reader.DataProcessor(fpattern=args.predict_file,
                                     src_vocab_fpath=args.src_vocab_fpath,
                                     trg_vocab_fpath=args.trg_vocab_fpath,
                                     token_delimiter=args.token_delimiter,
                                     use_token_batch=False,
                                     batch_size=args.batch_size,
                                     device_count=1,
                                     pool_size=args.pool_size,
                                     sort_type=reader.SortType.NONE,
                                     shuffle=False,
                                     shuffle_batch=False,
                                     start_mark=args.special_token[0],
                                     end_mark=args.special_token[1],
                                     unk_mark=args.special_token[2],
                                     max_length=args.max_length,
                                     n_head=args.n_head)
    batch_generator = processor.data_generator(phase="predict", place=place)
    args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \
        args.unk_idx = processor.get_vocab_summary()
    trg_idx2word = reader.DataProcessor.load_dict(
        dict_path=args.trg_vocab_fpath, reverse=True)

    with fluid.dygraph.guard(place):
        # define data loader
        test_loader = fluid.io.DataLoader.from_generator(capacity=10)
        test_loader.set_batch_generator(batch_generator, places=place)

        # define model
        transformer = Transformer(
            args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
            args.n_layer, args.n_head, args.d_key, args.d_value, args.d_model,
            args.d_inner_hid, args.prepostprocess_dropout,
            args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
            args.postprocess_cmd, args.weight_sharing, args.bos_idx,
            args.eos_idx)

        # load the trained model
        assert args.init_from_params, (
            "Please set init_from_params to load the infer model.")
        model_dict, _ = fluid.load_dygraph(
            os.path.join(args.init_from_params, "transformer"))
        # re-initialize the position-encoding tables to max_length, since
        # inference may need a longer length than was used in training
        model_dict["encoder.pos_encoder.weight"] = position_encoding_init(
            args.max_length + 1, args.d_model)
        model_dict["decoder.pos_encoder.weight"] = position_encoding_init(
            args.max_length + 1, args.d_model)
        transformer.load_dict(model_dict)

        # set evaluate mode
        transformer.eval()

        f = open(args.output_file, "wb")
        for input_data in test_loader():
            (src_word, src_pos, src_slf_attn_bias, trg_word,
             trg_src_attn_bias) = input_data
            finished_seq, finished_scores = transformer.beam_search(
                src_word,
                src_pos,
                src_slf_attn_bias,
                trg_word,
                trg_src_attn_bias,
                bos_id=args.bos_idx,
                eos_id=args.eos_idx,
                beam_size=args.beam_size,
                max_len=args.max_out_len)
            finished_seq = finished_seq.numpy()
            finished_scores = finished_scores.numpy()
            for ins in finished_seq:
                for beam_idx, beam in enumerate(ins):
                    if beam_idx >= args.n_best: break
                    id_list = post_process_seq(beam, args.bos_idx,
                                               args.eos_idx)
                    word_list = [trg_idx2word[id] for id in id_list]
                    sequence = b" ".join(word_list) + b"\n"
                    f.write(sequence)
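
post_process_seq is assumed here to crop a beam hypothesis at the first end token and drop the begin token before the ids are looked up in trg_idx2word; a minimal sketch consistent with how it is called above:

def post_process_seq(seq, bos_idx, eos_idx):
    """Crop at the first eos and drop bos, returning the remaining ids."""
    ids = []
    for idx in seq:
        if idx == eos_idx:
            break
        if idx != bos_idx:
            ids.append(idx)
    return ids

# with bos=0 and eos=1: [0, 5, 7, 1, 9] -> [5, 7]
assert post_process_seq([0, 5, 7, 1, 9], 0, 1) == [5, 7]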
Example #5
def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
               token_num, predict):
    # Initialize the parameters.
    if TrainTaskConfig.ckpt_path:
        fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
        lr_scheduler.current_steps = TrainTaskConfig.start_step
    else:
        logging.info("init fluid.framework.default_startup_program")
        exe.run(fluid.framework.default_startup_program())

    logging.info("begin reader")
    train_data = reader.DataReader(
        src_vocab_fpath=args.src_vocab_fpath,
        trg_vocab_fpath=args.trg_vocab_fpath,
        fpattern=args.train_file_pattern,
        token_delimiter=args.token_delimiter,
        use_token_batch=args.use_token_batch,
        batch_size=args.batch_size *
        (1 if args.use_token_batch else dev_count),
        pool_size=args.pool_size,
        sort_type=args.sort_type,
        shuffle=args.shuffle,
        shuffle_batch=args.shuffle_batch,
        start_mark=args.special_token[0],
        end_mark=args.special_token[1],
        unk_mark=args.special_token[2],
        # reserve two positions for the start and end tokens
        max_length=ModelHyperParams.max_length - 2,
        clip_last_batch=False)

    logging.info("begin read multiple")
    train_data = read_multiple(reader=train_data.batch_generator,
                               count=dev_count if args.use_token_batch else 1)

    build_strategy = fluid.BuildStrategy()
    # Since the token number differs among devices, customize the gradient
    # scale to use the token-averaged cost across devices: the gradient scale
    # is `1 / token_number` for the average cost.
    build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized
    train_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                       loss_name=sum_cost.name,
                                       main_program=train_progm,
                                       build_strategy=build_strategy)

    data_input_names = encoder_data_input_fields + \
        decoder_data_input_fields[:-1] + label_data_input_fields
    util_input_names = encoder_util_input_fields + decoder_util_input_fields

    if args.val_file_pattern is not None:
        test = test_context(train_progm, avg_cost, train_exe, dev_count,
                            data_input_names, util_input_names, sum_cost,
                            token_num)

    # the best cross-entropy value with label smoothing
    loss_normalizer = -(
        (1. - TrainTaskConfig.label_smooth_eps) *
        np.log(1. - TrainTaskConfig.label_smooth_eps) +
        TrainTaskConfig.label_smooth_eps *
        np.log(TrainTaskConfig.label_smooth_eps /
               (ModelHyperParams.trg_vocab_size - 1) + 1e-20))
    logging.info("begin train:")
    init = False
    for pass_id in range(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        logging.info("pass_id:{0}".format(pass_id))
        avg_batch_time = time.time()
        for batch_id, data in enumerate(train_data()):
            logging.info("batch_id:{0} data_len:{1}".format(
                batch_id, len(data)))
            feed_list = []
            total_num_token = 0
            if args.local:
                lr_rate = lr_scheduler.update_learning_rate()
            for place_id, data_buffer in enumerate(
                    split_data(data, num_part=dev_count)):
                data_input_dict, util_input_dict, num_token = prepare_batch_input(
                    data_buffer, data_input_names, util_input_names,
                    ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head, ModelHyperParams.d_model)
                total_num_token += num_token
                feed_kv_pairs = list(data_input_dict.items()) + list(
                    util_input_dict.items())
                if args.local:
                    feed_kv_pairs.append(
                        (lr_scheduler.learning_rate.name, lr_rate))
                feed_list.append(dict(feed_kv_pairs))

                if not init:
                    for pos_enc_param_name in pos_enc_param_names:
                        pos_enc = position_encoding_init(
                            ModelHyperParams.max_length + 1,
                            ModelHyperParams.d_model)
                        feed_list[place_id][pos_enc_param_name] = pos_enc
            for feed_dict in feed_list:
                feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token

            outs = train_exe.run(
                fetch_list=([sum_cost.name, token_num.name]
                            if batch_id % 100 == 0 else []),
                feed=feed_list)

            if batch_id % 100 == 0 and batch_id > 0:
                sum_cost_val = np.array(outs[0])
                token_num_val = np.array(outs[1])
                total_sum_cost = sum_cost_val.sum()  # sum the cost from multi-devices
                total_token_num = token_num_val.sum()
                total_avg_cost = total_sum_cost / total_token_num
                logging.info(
                    "epoch: %d, batch: %d, avg loss: %f, normalized loss: %f,"
                    " ppl: %f" %
                    (pass_id, batch_id, total_avg_cost, total_avg_cost -
                     loss_normalizer, np.exp([min(total_avg_cost, 100)])))

                logging.info("speed: {0} batch/s".format(
                    100.0 / (time.time() - avg_batch_time)))
            """
            if batch_id > 0 and batch_id % 1000 == 0:
                fluid.io.save_persistables(
                    exe,
                    os.path.join(TrainTaskConfig.ckpt_dir, "latest.checkpoint"))
            """
            init = True

            if batch_id % 100 == 0 and batch_id > 0:
                avg_batch_time = time.time()

        time_consumed = time.time() - pass_start_time
        # Validate and save the model for inference.
        if args.val_file_pattern is not None:
            val_avg_cost, val_ppl = test()
            logging.info(
                "epoch: %d, val avg loss: %f, val normalized loss: %f, val ppl: %f,"
                " consumed %fs" % (pass_id, val_avg_cost, val_avg_cost -
                                   loss_normalizer, val_ppl, time_consumed))
        else:
            logging.info("epoch: %d, consumed %fs" % (pass_id, time_consumed))
        fluid.io.save_persistables(
            exe,
            os.path.join(TrainTaskConfig.ckpt_dir,
                         "pass_" + str(pass_id) + ".checkpoint"))
        fluid.io.save_inference_model(
            os.path.join(TrainTaskConfig.model_dir,
                         "pass_" + str(pass_id) + ".infer.model"),
            data_input_names[:-2] + util_input_names, [predict], exe)
    if args.enable_ce:  # For CE
        print("kpis\ttrain_cost_card%d\t%f" % (dev_count, total_avg_cost))
        print("kpis\ttest_cost_card%d\t%f" % (dev_count, val_avg_cost))
        print("kpis\ttrain_duration_card%d\t%f" % (dev_count, time_consumed))
Example #6
def main():
    """
    model train
    """
    is_local = os.getenv("PADDLE_IS_LOCAL", "0")
    if is_local == '0':
        args.local = False
    else:
        args.local = True
    # init
    place = fluid.CUDAPlace(0) if args.device == 'GPU' else fluid.CPUPlace()
    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
    if training_role == "PSERVER":
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        ModelHyperParams.src_pad_idx, ModelHyperParams.trg_pad_idx,
        ModelHyperParams.pos_pad_idx)

    warmup_steps = get_var("warmup_steps", value=TrainTaskConfig.warmup_steps)
    d_model = get_var("d_model", value=ModelHyperParams.d_model)

    lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
        d_model, warmup_steps)

    optimizer = fluid.optimizer.Adam(learning_rate=lr_decay,
                                     beta1=TrainTaskConfig.beta1,
                                     beta2=TrainTaskConfig.beta2,
                                     epsilon=TrainTaskConfig.eps)
    optimize_ops, params_grads = optimizer.minimize(
        avg_cost if TrainTaskConfig.use_avg_cost else sum_cost)

    # Program to do validation.
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program([avg_cost])

    def test(exe):
        test_total_cost = 0
        test_total_token = 0
        for batch_id, data in enumerate(test_reader()):
            data_input = prepare_batch_input(
                data, encoder_input_data_names + decoder_input_data_names[:-1] +
                label_data_names, ModelHyperParams.eos_idx,
                ModelHyperParams.eos_idx, ModelHyperParams.n_head,
                ModelHyperParams.d_model)
            test_sum_cost, test_token_num = exe.run(
                inference_program,
                feed=data_input,
                fetch_list=[sum_cost, token_num],
                use_program_cache=True)
            test_total_cost += test_sum_cost
            test_total_token += test_token_num
        test_avg_cost = test_total_cost / test_total_token
        test_ppl = np.exp([min(test_avg_cost, 100)])
        return test_avg_cost, test_ppl

    def train_loop(exe, trainer_prog):
        for pass_id in range(args.pass_num):
            ts = time.time()
            total = 0
            pass_start_time = time.time()
            for batch_id, data in enumerate(train_reader):
                if len(data) != args.batch_size:
                    continue

                total += len(data)
                start_time = time.time()
                data_input = prepare_batch_input(
                    data, encoder_input_data_names +
                    decoder_input_data_names[:-1] + label_data_names,
                    ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head, ModelHyperParams.d_model)

                outs = exe.run(trainer_prog,
                               feed=data_input,
                               fetch_list=[sum_cost, avg_cost],
                               use_program_cache=True)
                sum_cost_val, avg_cost_val = np.array(outs[0]), np.array(outs[1])
                print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f, speed: %.2f" %
                      (pass_id, batch_id, sum_cost_val, avg_cost_val,
                       np.exp([min(avg_cost_val[0], 100)]), 
                       len(data) / (time.time() - start_time)))

                if args.test_save:
                    if batch_id == args.exit_batch_id:
                        print("batch_id: %d exit!" % batch_id)
                        break

            # Validate and save the model for inference.
            # val_avg_cost, val_ppl = test(exe)
            val_avg_cost, val_ppl = 0, 0
            pass_end_time = time.time()
            time_consumed = pass_end_time - pass_start_time
            print("pass_id = %s time_consumed = %s val_avg_cost=%f val_ppl=%f speed: %.2f" % \
                  (str(pass_id), str(time_consumed), \
                     val_avg_cost, val_ppl, total / (time.time() - ts)))

            fluid.io.save_inference_model(
                os.path.join(args.model_path,
                             "pass_" + str(pass_id) + "_" + str(args.task_index) + ".infer.model"),
                encoder_input_data_names + decoder_input_data_names[:-1],
                [predict], exe)

            if args.test_save:
                break

    if args.local:
        # Initialize the parameters.
        print("local start_up:")
        exe.run(fluid.framework.default_startup_program())
        #print(debuger.pprint_program_codes(fluid.framework.default_startup_program()))
        for pos_enc_param_name in pos_enc_param_names:
            #print("pos_enc_param_name:", pos_enc_param_name)
            pos_enc_param = fluid.global_scope().find_var(
                pos_enc_param_name).get_tensor()
            pos_enc_param.set(
                position_encoding_init(ModelHyperParams.max_length + 1,
                                       ModelHyperParams.d_model), place)
         
        #print "./nist06n/data-%d/part-*" % (args.task_index),
        train_reader = data_util.DataLoader(
          src_vocab_fpath="./thirdparty/nist06n/cn_30001.dict",
          trg_vocab_fpath="./thirdparty/nist06n/en_30001.dict",
          fpattern="./train/*" % (args.task_index),
          batch_size=args.batch_size,
          token_batch_size=TrainTaskConfig.token_batch_size,
          sort_by_length=TrainTaskConfig.sort_by_length,
          shuffle=True)

        train_loop(exe, fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        t = fluid.DistributeTranspiler()
        t.transpile(
            trainer_id,
            pservers=pserver_endpoints,
            trainers=trainers)
             
        if training_role == "PSERVER":
            current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
                "PADDLE_PORT")
            if not current_endpoint:
                print("need env SERVER_ENDPOINT")
                exit(1)
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)

            if args.save_graph:
                for block_no, block in enumerate(pserver_startup.blocks):
                    block_name = "pserver_startup_block_%04d" % block_no
                    print(block_name)
                    print(debuger.draw_block_graphviz(
                        block, path="./" + block_name + ".dot"))

                for block_no, block in enumerate(pserver_prog.blocks):
                    block_name = "pserver_prog_block_%04d" % block_no
                    print(debuger.draw_block_graphviz(
                        block, path="./" + block_name + ".dot"))

            print "begin run"
            exe.run(pserver_startup)#, save_program_to_file="./pserver_startup.desc")
            exe.run(pserver_prog)#, save_program_to_file="./pserver_loop.desc")
        elif training_role == "TRAINER":
            # Parameter initialization
            exe.run(fluid.default_startup_program())

            #print("cluster start_up:")

            for pos_enc_param_name in pos_enc_param_names:
                #print("pos_enc_param_name:", pos_enc_param_name)
                pos_enc_param = fluid.global_scope().find_var(
                    pos_enc_param_name).get_tensor()
                pos_enc_param.set(
                    position_encoding_init(ModelHyperParams.max_length + 1,
                                           ModelHyperParams.d_model), place)

            train_reader = data_util.DataLoader(
                src_vocab_fpath="./thirdparty/nist06n/cn_30001.dict",
                trg_vocab_fpath="./thirdparty/nist06n/en_30001.dict",
                fpattern="./train/part-*",
                batch_size=args.batch_size,
                token_batch_size=TrainTaskConfig.token_batch_size,
                sort_by_length=TrainTaskConfig.sort_by_length,
                shuffle=True)

            
            trainer_prog = t.get_trainer_program()
            train_loop(exe, trainer_prog)
        else:
            print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
Example #7
    def train_loop(exe, train_progm):
        def read_multiple(reader,
                          count=dev_count if args.use_token_batch else 1,
                          clip_last=True):
            """
            Stack data from reader for multi-devices.
            """
            def __impl__():
                res = []
                for item in reader():
                    res.append(item)
                    if len(res) == count:
                        yield res
                        res = []
                if len(res) == count:
                    yield res
                elif not clip_last:
                    data = []
                    for item in res:
                        data += item
                    if len(data) > count:
                        inst_num_per_part = len(data) // count
                        yield [
                            data[inst_num_per_part * i:inst_num_per_part *
                                 (i + 1)] for i in range(count)
                        ]

            return __impl__

        def split_data(data, num_part=dev_count):
            """
            Split data for each device.
            """
            if len(data) == num_part:
                return data
            data = data[0]
            inst_num_per_part = len(data) // num_part
            return [
                data[inst_num_per_part * i:inst_num_per_part * (i + 1)]
                for i in range(num_part)
            ]

        # Initialize the parameters.
        if TrainTaskConfig.ckpt_path:
            fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
            lr_scheduler.current_steps = TrainTaskConfig.start_step
        else:
            print "init fluid.framework.default_startup_program"
            exe.run(fluid.framework.default_startup_program())

        train_data = reader.DataReader(
            src_vocab_fpath=args.src_vocab_fpath,
            trg_vocab_fpath=args.trg_vocab_fpath,
            fpattern=args.train_file_pattern,
            use_token_batch=args.use_token_batch,
            batch_size=args.batch_size *
            (1 if args.use_token_batch else dev_count),
            pool_size=args.pool_size,
            sort_type=args.sort_type,
            shuffle=args.shuffle,
            shuffle_batch=args.shuffle_batch,
            start_mark=args.special_token[0],
            end_mark=args.special_token[1],
            unk_mark=args.special_token[2],
            clip_last_batch=False)

        train_data = read_multiple(reader=train_data.batch_generator)
        build_strategy = fluid.BuildStrategy()
        # Since the token number differs among devices, customize the gradient
        # scale to use the token-averaged cost across devices: the gradient
        # scale is `1 / token_number` for the average cost.
        build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized
        train_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                           loss_name=sum_cost.name,
                                           main_program=train_progm,
                                           build_strategy=build_strategy)

        def test_context():
            # Context to do validation.
            test_program = train_progm.clone()
            with fluid.program_guard(test_program):
                test_program = fluid.io.get_inference_program([avg_cost])

            val_data = reader.DataReader(
                src_vocab_fpath=args.src_vocab_fpath,
                trg_vocab_fpath=args.trg_vocab_fpath,
                fpattern=args.val_file_pattern,
                use_token_batch=args.use_token_batch,
                batch_size=args.batch_size *
                (1 if args.use_token_batch else dev_count),
                pool_size=args.pool_size,
                sort_type=args.sort_type,
                start_mark=args.special_token[0],
                end_mark=args.special_token[1],
                unk_mark=args.special_token[2],
                clip_last_batch=False,
                shuffle=False,
                shuffle_batch=False)

            test_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                              main_program=test_program,
                                              share_vars_from=train_exe)

            def test(exe=test_exe):
                test_total_cost = 0
                test_total_token = 0
                test_data = read_multiple(reader=val_data.batch_generator)
                for batch_id, data in enumerate(test_data()):
                    feed_list = []
                    for place_id, data_buffer in enumerate(split_data(data)):
                        data_input_dict, util_input_dict, _ = prepare_batch_input(
                            data_buffer, data_input_names, util_input_names,
                            ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                            ModelHyperParams.n_head, ModelHyperParams.d_model)
                        feed_list.append(
                            dict(list(data_input_dict.items()) +
                                 list(util_input_dict.items())))

                    outs = exe.run(feed=feed_list,
                                   fetch_list=[sum_cost.name, token_num.name])
                    sum_cost_val, token_num_val = np.array(outs[0]), np.array(
                        outs[1])
                    test_total_cost += sum_cost_val.sum()
                    test_total_token += token_num_val.sum()
                test_avg_cost = test_total_cost / test_total_token
                test_ppl = np.exp([min(test_avg_cost, 100)])
                return test_avg_cost, test_ppl

            return test

        if args.val_file_pattern is not None:
            test = test_context()

        data_input_names = encoder_data_input_fields + \
            decoder_data_input_fields[:-1] + label_data_input_fields
        util_input_names = encoder_util_input_fields + decoder_util_input_fields
        init = False
        for pass_id in range(TrainTaskConfig.pass_num):
            pass_start_time = time.time()
            for batch_id, data in enumerate(train_data()):
                feed_list = []
                total_num_token = 0
                #lr_rate = lr_scheduler.update_learning_rate()
                for place_id, data_buffer in enumerate(split_data(data)):
                    data_input_dict, util_input_dict, num_token = prepare_batch_input(
                        data_buffer, data_input_names, util_input_names,
                        ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                        ModelHyperParams.n_head, ModelHyperParams.d_model)
                    total_num_token += num_token
                    feed_list.append(
                        dict(list(data_input_dict.items()) +
                             list(util_input_dict.items())))

                    if not init:
                        for pos_enc_param_name in pos_enc_param_names:
                            pos_enc = position_encoding_init(
                                ModelHyperParams.max_length + 1,
                                ModelHyperParams.d_model)
                            feed_list[place_id][pos_enc_param_name] = pos_enc
                for feed_dict in feed_list:
                    feed_dict[sum_cost.name + "@GRAD"] = (
                        1. / total_num_token if TrainTaskConfig.use_avg_cost
                        else np.asarray([1.], dtype="float32"))
                outs = train_exe.run(
                    fetch_list=[sum_cost.name, token_num.name], feed=feed_list)
                #outs = exe.run(train_progm,fetch_list=[sum_cost.name, token_num.name],feed=feed_list[0])
                sum_cost_val = np.array(outs[0])
                token_num_val = np.array(outs[1])
                total_sum_cost = sum_cost_val.sum()  # sum the cost from multi-devices
                total_token_num = token_num_val.sum()
                total_avg_cost = total_sum_cost / total_token_num
                print(
                    "epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f"
                    % (pass_id, batch_id, total_sum_cost, total_avg_cost,
                       np.exp([min(total_avg_cost, 100)])))
                init = True
            # Validate and save the model for inference.
            print("epoch: %d, " % pass_id +
                  ("val avg loss: %f, val ppl: %f, " %
                   test() if args.val_file_pattern is not None else "") +
                  "consumed %fs" % (time.time() - pass_start_time))
            fluid.io.save_persistables(
                exe,
                os.path.join(TrainTaskConfig.ckpt_dir,
                             "pass_" + str(pass_id) + ".checkpoint"))
            fluid.io.save_inference_model(
                os.path.join(TrainTaskConfig.model_dir,
                             "pass_" + str(pass_id) + ".infer.model"),
                data_input_names[:-2] + util_input_names, [predict], exe)
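
split_data, defined at the top of this example, either passes a per-device list straight through or slices one flat batch into num_part equal chunks. A quick check of both paths with dummy data (num_part passed explicitly here, assuming dev_count is 2):

data_per_device = [["a"], ["b"]]  # already one entry per device
flat_batch = [[1, 2, 3, 4]]       # a single flat batch to be sliced

assert split_data(data_per_device, num_part=2) == [["a"], ["b"]]
assert split_data(flat_batch, num_part=2) == [[1, 2], [3, 4]]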
Example #8
def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
               token_num, predict):
    # Initialize the parameters.
    if TrainTaskConfig.ckpt_path:
        fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
        lr_scheduler.current_steps = TrainTaskConfig.start_step
    else:
        print "init fluid.framework.default_startup_program"
        exe.run(fluid.framework.default_startup_program())

    train_data = reader.DataReader(
        src_vocab_fpath=args.src_vocab_fpath,
        trg_vocab_fpath=args.trg_vocab_fpath,
        fpattern=args.train_file_pattern,
        use_token_batch=args.use_token_batch,
        batch_size=args.batch_size *
        (1 if args.use_token_batch else dev_count),
        pool_size=args.pool_size,
        sort_type=args.sort_type,
        shuffle=args.shuffle,
        shuffle_batch=args.shuffle_batch,
        start_mark=args.special_token[0],
        end_mark=args.special_token[1],
        unk_mark=args.special_token[2],
        # reserve two positions for the start and end tokens
        max_length=ModelHyperParams.max_length - 2,
        clip_last_batch=False)
    train_data = read_multiple(reader=train_data.batch_generator,
                               count=dev_count if args.use_token_batch else 1)

    build_strategy = fluid.BuildStrategy()
    # Since the token number differs among devices, customize the gradient
    # scale to use the token-averaged cost across devices: the gradient scale
    # is `1 / token_number` for the average cost.
    build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized
    train_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                       loss_name=sum_cost.name,
                                       main_program=train_progm,
                                       build_strategy=build_strategy)

    data_input_names = encoder_data_input_fields + \
        decoder_data_input_fields[:-1] + label_data_input_fields
    util_input_names = encoder_util_input_fields + decoder_util_input_fields

    if args.val_file_pattern is not None:
        test = test_context(train_progm, avg_cost, train_exe, dev_count,
                            data_input_names, util_input_names, sum_cost,
                            token_num)

    init = False
    for pass_id in range(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        for batch_id, data in enumerate(train_data()):
            feed_list = []
            total_num_token = 0
            for place_id, data_buffer in enumerate(
                    split_data(data, num_part=dev_count)):
                data_input_dict, util_input_dict, num_token = prepare_batch_input(
                    data_buffer, data_input_names, util_input_names,
                    ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head, ModelHyperParams.d_model)
                total_num_token += num_token
                feed_kv_pairs = list(data_input_dict.items()) + list(
                    util_input_dict.items())
                if args.local:
                    lr_rate = lr_scheduler.update_learning_rate()
                    feed_kv_pairs.append(
                        (lr_scheduler.learning_rate.name, lr_rate))
                feed_list.append(dict(feed_kv_pairs))

                if not init:
                    for pos_enc_param_name in pos_enc_param_names:
                        pos_enc = position_encoding_init(
                            ModelHyperParams.max_length + 1,
                            ModelHyperParams.d_model)
                        feed_list[place_id][pos_enc_param_name] = pos_enc
            for feed_dict in feed_list:
                feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token
            outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name],
                                 feed=feed_list)
            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            total_sum_cost = sum_cost_val.sum()  # sum the cost from multi-devices
            total_token_num = token_num_val.sum()
            total_avg_cost = total_sum_cost / total_token_num
            print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
                  (pass_id, batch_id, total_sum_cost, total_avg_cost,
                   np.exp([min(total_avg_cost, 100)])))
            init = True
        # Validate and save the model for inference.
        print("epoch: %d, " % pass_id +
              ("val avg loss: %f, val ppl: %f, " %
               test() if args.val_file_pattern is not None else "") +
              "consumed %fs" % (time.time() - pass_start_time))
        fluid.io.save_persistables(
            exe,
            os.path.join(TrainTaskConfig.ckpt_dir,
                         "pass_" + str(pass_id) + ".checkpoint"))
        fluid.io.save_inference_model(
            os.path.join(TrainTaskConfig.model_dir,
                         "pass_" + str(pass_id) + ".infer.model"),
            data_input_names[:-2] + util_input_names, [predict], exe)
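
Examples #5, #7, and #8 all pair GradientScaleStrategy.Customized with a feed for sum_cost.name + "@GRAD". The fed value becomes the initial gradient of the summed loss, so feeding 1 / total_num_token makes the backward pass equivalent to optimizing the token-averaged cost across all devices. The chain rule behind that, checked on a toy model:

import numpy as np

# toy model: per-token loss_i = w * x_i, so sum_cost = w * sum(x)
x = np.array([1.0, 2.0, 3.0])
num_tokens = x.size

grad_sum = x.sum()   # d(sum_cost)/dw
grad_avg = x.mean()  # d(avg_cost)/dw
# scaling the initial gradient of the sum by 1/num_tokens recovers the
# gradient of the average -- exactly what the @GRAD feed does
assert np.isclose(grad_sum * (1.0 / num_tokens), grad_avg)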
Example #9
def main():
    place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1,
        ModelHyperParams.n_layer, ModelHyperParams.n_head,
        ModelHyperParams.d_key, ModelHyperParams.d_value,
        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)

    lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
                                         TrainTaskConfig.warmup_steps, place,
                                         TrainTaskConfig.learning_rate)
    optimizer = fluid.optimizer.Adam(learning_rate=lr_scheduler.learning_rate,
                                     beta1=TrainTaskConfig.beta1,
                                     beta2=TrainTaskConfig.beta2,
                                     epsilon=TrainTaskConfig.eps)
    optimizer.minimize(avg_cost if TrainTaskConfig.use_avg_cost else sum_cost)

    train_data = paddle.batch(
        paddle.reader.shuffle(
            nist_data_provider.train("data", ModelHyperParams.src_vocab_size,
                                     ModelHyperParams.trg_vocab_size),
            buf_size=100000),
        batch_size=TrainTaskConfig.batch_size)

    # Initialize the parameters.
    exe.run(fluid.framework.default_startup_program())
    for pos_enc_param_name in pos_enc_param_names:
        pos_enc_param = fluid.global_scope().find_var(
            pos_enc_param_name).get_tensor()
        pos_enc_param.set(
            position_encoding_init(ModelHyperParams.max_length + 1,
                                   ModelHyperParams.d_model), place)

    for pass_id in range(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        for batch_id, data in enumerate(train_data()):
            data_input = prepare_batch_input(
                data, encoder_input_data_names +
                decoder_input_data_names[:-1] + label_data_names,
                ModelHyperParams.src_pad_idx, ModelHyperParams.trg_pad_idx,
                ModelHyperParams.n_head, ModelHyperParams.d_model)
            lr_scheduler.update_learning_rate(data_input)
            outs = exe.run(fluid.framework.default_main_program(),
                           feed=data_input,
                           fetch_list=[sum_cost, avg_cost],
                           use_program_cache=True)
            sum_cost_val, avg_cost_val = np.array(outs[0]), np.array(outs[1])
            print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
                  (pass_id, batch_id, sum_cost_val, avg_cost_val,
                   np.exp([min(avg_cost_val[0], 100)])))
        pass_end_time = time.time()
        time_consumed = pass_end_time - pass_start_time
        print("pass_id = " + str(pass_id) + " time_consumed = " +
              str(time_consumed))
        fluid.io.save_inference_model(
            os.path.join(TrainTaskConfig.model_dir,
                         "pass_" + str(pass_id) + ".infer.model"),
            encoder_input_data_names + decoder_input_data_names[:-1],
            [predict], exe)
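
Several of these examples iterate over pos_enc_param_names to overwrite the position-encoding parameters after start-up. Its definition is not shown; in the PaddlePaddle Transformer configs it is typically the pair of encoder/decoder table names, along the lines of the following (the exact strings must match the parameter names created by transformer()):

# assumed definition, one table per side of the model
pos_enc_param_names = (
    "src_pos_enc_table",
    "trg_pos_enc_table",
)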
Example #10
def do_predict(args):
    if args.use_cuda:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    # define the data generator
    processor = reader.DataProcessor(fpattern=args.predict_file,
                                     src_vocab_fpath=args.src_vocab_fpath,
                                     trg_vocab_fpath=args.trg_vocab_fpath,
                                     token_delimiter=args.token_delimiter,
                                     use_token_batch=False,
                                     batch_size=args.batch_size,
                                     device_count=1,
                                     pool_size=args.pool_size,
                                     sort_type=reader.SortType.NONE,
                                     shuffle=False,
                                     shuffle_batch=False,
                                     only_src=args.only_src,
                                     start_mark=args.special_token[0],
                                     end_mark=args.special_token[1],
                                     unk_mark=args.special_token[2],
                                     max_length=args.max_length,
                                     n_head=args.n_head,
                                     stream=args.stream,
                                     src_bpe_dict=args.src_bpe_dict)
    batch_generator = processor.data_generator(phase="predict", place=place)
    args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \
        args.unk_idx = processor.get_vocab_summary()
    trg_idx2word = reader.DataProcessor.load_dict(
        dict_path=args.trg_vocab_fpath, reverse=True)

    with fluid.dygraph.guard(place):
        # define data loader
        test_loader = fluid.io.DataLoader.from_generator(capacity=10)
        test_loader.set_batch_generator(batch_generator, places=place)

        # define model
        transformer = Transformer(
            args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
            args.n_layer, args.n_head, args.d_key, args.d_value, args.d_model,
            args.d_inner_hid, args.prepostprocess_dropout,
            args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
            args.postprocess_cmd, args.weight_sharing, args.bos_idx,
            args.eos_idx)

        # load the trained model
        assert args.init_from_params, (
            "Please set init_from_params to load the infer model.")
        model_dict, _ = fluid.load_dygraph(
            os.path.join(args.init_from_params, "transformer"))
        # re-initialize the position-encoding tables to max_length, since
        # inference may need a longer length than was used in training
        model_dict["encoder.pos_encoder.weight"] = position_encoding_init(
            args.max_length + 1, args.d_model)
        model_dict["decoder.pos_encoder.weight"] = position_encoding_init(
            args.max_length + 1, args.d_model)
        transformer.load_dict(model_dict)

        # set evaluate mode
        transformer.eval()

        f = open(args.output_file, "wb")

        detok = MosesDetokenizer(lang='en')
        detc = MosesDetruecaser()

        for input_data in test_loader():
            if args.stream:
                (src_word, src_pos, src_slf_attn_bias, trg_word,
                 trg_src_attn_bias, real_read) = input_data
            else:
                (src_word, src_pos, src_slf_attn_bias, trg_word,
                 trg_src_attn_bias) = input_data

            finished_seq, finished_scores = transformer.beam_search(
                src_word,
                src_pos,
                src_slf_attn_bias,
                trg_word,
                trg_src_attn_bias,
                bos_id=args.bos_idx,
                eos_id=args.eos_idx,
                beam_size=args.beam_size,
                max_len=args.max_out_len,
                waitk=args.waitk,
                stream=args.stream)
            finished_seq = finished_seq.numpy()
            finished_scores = finished_scores.numpy()
            for idx, ins in enumerate(finished_seq):
                for beam_idx, beam in enumerate(ins):
                    if beam_idx >= args.n_best: break
                    id_list = post_process_seq(beam, args.bos_idx,
                                               args.eos_idx)
                    word_list = [trg_idx2word[id] for id in id_list]

                    if args.stream:
                        if args.waitk > 0:
                            # for wait-k models, hold output for the first k - 1 reads
                            word_list = [b''] * (args.waitk - 1) + word_list
                        else:
                            # for a full-sentence model, wait until the whole source is read
                            word_list = [b''] * (len(real_read[idx].numpy()) -
                                                 1) + word_list

                        final_output = []
                        real_output = []
                        _read = real_read[idx].numpy()
                        sent = ''

                        for j in range(max(len(_read), len(word_list))):
                            # append number of reads at step j
                            r = _read[j] if j < len(_read) else 0
                            if r > 0:
                                final_output += [b''] * (r - 1)

                            # append number of writes at step j
                            w = word_list[j] if j < len(word_list) else b''
                            w = w.decode('utf-8')
                            real_output.append(w)


                            _sent = ' '.join(real_output)

                            if len(_sent) > 0:

                                _sent += ' a'
                                _sent = ' '.join(_sent.split())

                                _sent = _sent.replace('@@ ', '')
                                _sent = detok.detokenize(_sent.split())
                                _sent = detc.detruecase(_sent)
                                _sent = ' '.join(_sent)
                                _sent = _sent[:-1].strip()

                            incre = _sent[len(sent):]
                            sent = _sent

                            if r > 0:
                                # a read happened at this step, so the new
                                # increment starts its own output slot
                                final_output.append(str.encode(incre))
                            else:
                                # no read at this step: append the increment
                                # to the previous write
                                if j >= len(word_list):
                                    break
                                final_output[-1] += str.encode(incre)

                        sequence = b"\n".join(final_output) + b" \n"
                        f.write(sequence)
                    else:
                        sequence = b" ".join(word_list) + b"\n"
                        f.write(sequence)
                    f.flush()
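
The streaming branch above re-detokenizes the whole hypothesis at every step and emits only the new suffix (incre = _sent[len(sent):]). A stripped-down illustration of that prefix-diff idea; note it assumes detokenization never rewrites characters that were already emitted:

emitted = ""
for partial in ["Hello", "Hello ,", "Hello , world", "Hello , world !"]:
    # pretend `partial` is the detokenized hypothesis after this step
    incre = partial[len(emitted):]  # only the not-yet-emitted suffix
    emitted = partial
    print(repr(incre))  # 'Hello', ' ,', ' world', ' !'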