Example #1
def train_static(args, batch_generator):
    paddle.manual_seed(SEED)
    paddle.framework.random._manual_program_seed(SEED)
    train_prog = fluid.Program()
    startup_prog = fluid.Program()

    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            # define input and reader
            input_field_names = util.encoder_data_input_fields + \
                                util.decoder_data_input_fields[:-1] + util.label_data_input_fields
            input_descs = util.get_input_descs(args)
            input_slots = [{
                "name": name,
                "shape": input_descs[name][0],
                "dtype": input_descs[name][1]
            } for name in input_field_names]
            input_field = util.InputField(input_slots)
            # Define DataLoader
            data_loader = fluid.io.DataLoader.from_generator(
                input_field.feed_list, capacity=60)
            data_loader.set_batch_generator(batch_generator, places=place)
            # define model
            transformer = Transformer(
                args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
                args.n_layer, args.n_head, args.d_key, args.d_value,
                args.d_model, args.d_inner_hid, args.prepostprocess_dropout,
                args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
                args.postprocess_cmd, args.weight_sharing, args.bos_idx,
                args.eos_idx)
            logits = transformer(*input_field.feed_list[:7])
            # define loss
            criterion = CrossEntropyCriterion(args.label_smooth_eps)
            lbl_word, lbl_weight = input_field.feed_list[7:]
            sum_cost, avg_cost, token_num = criterion(logits, lbl_word,
                                                      lbl_weight)
            # define optimizer
            learning_rate = fluid.layers.learning_rate_scheduler.noam_decay(
                args.d_model, args.warmup_steps, args.learning_rate)
            optimizer = fluid.optimizer.Adam(
                learning_rate=learning_rate,
                beta1=args.beta1,
                beta2=args.beta2,
                epsilon=float(args.eps))
            optimizer.minimize(avg_cost)
            # the best cross-entropy value with label smoothing
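            # (with smoothing eps and target vocabulary size V, the smoothed
            # target distribution puts 1 - eps on the gold token and
            # eps / (V - 1) on every other token; its entropy is the lowest
            # cross-entropy the model can reach, so it is subtracted below to
            # report a normalized loss)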
            loss_normalizer = -(
                (1. - args.label_smooth_eps) * np.log(1. - args.label_smooth_eps)
                + args.label_smooth_eps * np.log(
                    args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20))
    step_idx = 0
    total_batch_num = 0
    avg_loss = []
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    for pass_id in range(args.epoch):
        batch_id = 0
        for feed_dict in data_loader:
            outs = exe.run(program=train_prog,
                           feed=feed_dict,
                           fetch_list=[sum_cost.name, token_num.name])
            if step_idx % args.print_step == 0:
                sum_cost_val = np.array(outs[0])
                token_num_val = np.array(outs[1])
                total_sum_cost = sum_cost_val.sum()
                total_token_num = token_num_val.sum()
                total_avg_cost = total_sum_cost / total_token_num
                avg_loss.append(total_avg_cost)
                if step_idx == 0:
                    logging.info(
                        "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                        "normalized loss: %f, ppl: %f" %
                        (step_idx, pass_id, batch_id, total_avg_cost,
                         total_avg_cost - loss_normalizer,
                         np.exp([min(total_avg_cost, 100)])))
                    avg_batch_time = time.time()
                else:
                    logging.info(
                        "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                        "normalized loss: %f, ppl: %f, speed: %.2f steps/s" %
                        (step_idx, pass_id, batch_id, total_avg_cost,
                         total_avg_cost - loss_normalizer,
                         np.exp([min(total_avg_cost, 100)]),
                         args.print_step / (time.time() - avg_batch_time)))
                    avg_batch_time = time.time()
            batch_id += 1
            step_idx += 1
            total_batch_num = total_batch_num + 1
            if step_idx == STEP_NUM:
                if args.save_static_model_path:
                    model_path = os.path.join(args.save_static_model_path,
                                              "transformer")
                    fluid.save(train_prog, model_path)
                break
    return np.array(avg_loss)
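
These snippets rely on several module-level names that are defined elsewhere in the source file (place, SEED, STEP_NUM, trainer_count, plus the usual imports). A minimal sketch of plausible definitions, for reference only; the exact values in the original module may differ:

import os
import time
import logging

import numpy as np
import paddle
import paddle.fluid as fluid

SEED = 2020          # assumed fixed random seed
STEP_NUM = 10        # assumed cap on the number of train/predict steps
trainer_count = 1    # assumed single-trainer setup
place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
    else fluid.CPUPlace()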
Example #2
def predict_dygraph(args, batch_generator):
    with fluid.dygraph.guard(place):
        paddle.manual_seed(SEED)
        paddle.framework.random._manual_program_seed(SEED)

        # define data loader
        test_loader = fluid.io.DataLoader.from_generator(capacity=10)
        test_loader.set_batch_generator(batch_generator, places=place)

        # define model
        transformer = Transformer(
            args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
            args.n_layer, args.n_head, args.d_key, args.d_value, args.d_model,
            args.d_inner_hid, args.prepostprocess_dropout,
            args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
            args.postprocess_cmd, args.weight_sharing, args.bos_idx,
            args.eos_idx)

        # load the trained model
        model_dict, _ = util.load_dygraph(
            os.path.join(args.save_dygraph_model_path, "transformer"))
        # the saved position-encoding tables are sized for the lengths seen in
        # training; regenerate them for max_length + 1 so longer inputs are
        # still covered
        model_dict["encoder.pos_encoder.weight"] = position_encoding_init(
            args.max_length + 1, args.d_model)
        model_dict["decoder.pos_encoder.weight"] = position_encoding_init(
            args.max_length + 1, args.d_model)
        transformer.load_dict(model_dict)

        # set evaluate mode
        transformer.eval()

        step_idx = 0
        speed_list = []
        for input_data in test_loader():
            (src_word, src_pos, src_slf_attn_bias, trg_word,
             trg_src_attn_bias) = input_data
            seq_ids, seq_scores = transformer.beam_search(
                src_word,
                src_pos,
                src_slf_attn_bias,
                trg_word,
                trg_src_attn_bias,
                bos_id=args.bos_idx,
                eos_id=args.eos_idx,
                beam_size=args.beam_size,
                max_len=args.max_out_len)
            seq_ids = seq_ids.numpy()
            seq_scores = seq_scores.numpy()
            if step_idx % args.print_step == 0:
                if step_idx == 0:
                    logging.info(
                        "Dygraph Predict: step_idx: %d, 1st seq_id: %d, 1st seq_score: %.2f"
                        % (step_idx, seq_ids[0][0][0], seq_scores[0][0]))
                    avg_batch_time = time.time()
                else:
                    speed = args.print_step / (time.time() - avg_batch_time)
                    speed_list.append(speed)
                    logging.info(
                        "Dygraph Predict: step_idx: %d, 1st seq_id: %d, 1st seq_score: %.2f, speed: %.3f steps/s"
                        % (step_idx, seq_ids[0][0][0], seq_scores[0][0], speed))
                    avg_batch_time = time.time()

            step_idx += 1
            if step_idx == STEP_NUM:
                break
        logging.info("Dygraph Predict:  avg_speed: %.4f steps/s" %
                     (np.mean(speed_list)))
        return seq_ids, seq_scores
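
The logging lines above index the beam-search outputs as seq_ids[0][0][0] and seq_scores[0][0], i.e. first example, best candidate, first token. A hedged sketch of pulling out the best hypothesis for every example, assuming the [batch, beam, time] layout that this indexing suggests (the source does not state it explicitly):

best_ids = seq_ids[:, 0, :]      # best-scoring candidate for each example
best_scores = seq_scores[:, 0]   # and its score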
Example #3
def predict_static(args, batch_generator):
    test_prog = fluid.Program()
    with fluid.program_guard(test_prog):
        paddle.manual_seed(SEED)
        paddle.framework.random._manual_program_seed(SEED)

        # define input and reader
        input_field_names = util.encoder_data_input_fields + util.fast_decoder_data_input_fields
        input_descs = util.get_input_descs(args, 'test')
        input_slots = [{
            "name": name,
            "shape": input_descs[name][0],
            "dtype": input_descs[name][1]
        } for name in input_field_names]

        input_field = util.InputField(input_slots)
        feed_list = input_field.feed_list
        loader = fluid.io.DataLoader.from_generator(
            feed_list=feed_list, capacity=10)

        # define model
        transformer = Transformer(
            args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
            args.n_layer, args.n_head, args.d_key, args.d_value, args.d_model,
            args.d_inner_hid, args.prepostprocess_dropout,
            args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
            args.postprocess_cmd, args.weight_sharing, args.bos_idx,
            args.eos_idx)

        out_ids, out_scores = transformer.beam_search(
            *feed_list,
            bos_id=args.bos_idx,
            eos_id=args.eos_idx,
            beam_size=args.beam_size,
            max_len=args.max_out_len)

    # This is used here to set dropout to the test mode.
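    # (clone(for_test=True) prunes the backward and optimization ops and
    # switches operators such as dropout to their inference behavior)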
    test_prog = test_prog.clone(for_test=True)

    # define the executor for prediction
    exe = fluid.Executor(place)

    util.load(test_prog,
              os.path.join(args.save_static_model_path, "transformer"), exe)

    loader.set_batch_generator(batch_generator, places=place)

    step_idx = 0
    speed_list = []
    for feed_dict in loader:
        seq_ids, seq_scores = exe.run(
            test_prog,
            feed=feed_dict,
            fetch_list=[out_ids.name, out_scores.name],
            return_numpy=True)
        if step_idx % args.print_step == 0:
            if step_idx == 0:
                logging.info(
                    "Static Predict: step_idx: %d, 1st seq_id: %d, 1st seq_score: %.2f,"
                    % (step_idx, seq_ids[0][0][0], seq_scores[0][0]))
                avg_batch_time = time.time()
            else:
                speed = args.print_step / (time.time() - avg_batch_time)
                speed_list.append(speed)
                logging.info(
                    "Static Predict: step_idx: %d, 1st seq_id: %d, 1st seq_score: %.2f, speed: %.3f steps/s"
                    % (step_idx, seq_ids[0][0][0], seq_scores[0][0], speed))
                avg_batch_time = time.time()

        step_idx += 1
        if step_idx == STEP_NUM:
            break
    logging.info("Static Predict:  avg_speed: %.4f steps/s" %
                 (np.mean(speed_list)))

    return seq_ids, seq_scores
Example #4
def train_dygraph(args, batch_generator):
    with fluid.dygraph.guard(place):
        if SEED is not None:
            paddle.manual_seed(SEED)
            paddle.framework.random._manual_program_seed(SEED)
        # define data loader
        train_loader = fluid.io.DataLoader.from_generator(capacity=10)
        train_loader.set_batch_generator(batch_generator, places=place)
        # define model
        transformer = Transformer(
            args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
            args.n_layer, args.n_head, args.d_key, args.d_value, args.d_model,
            args.d_inner_hid, args.prepostprocess_dropout,
            args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
            args.postprocess_cmd, args.weight_sharing, args.bos_idx,
            args.eos_idx)
        # define loss
        criterion = CrossEntropyCriterion(args.label_smooth_eps)
        # define learning rate schedule with warmup
        learning_rate = fluid.layers.learning_rate_scheduler.noam_decay(
            args.d_model, args.warmup_steps, args.learning_rate)
        # define optimizer
        optimizer = fluid.optimizer.Adam(
            learning_rate=learning_rate,
            beta1=args.beta1,
            beta2=args.beta2,
            epsilon=float(args.eps),
            parameter_list=transformer.parameters())
        # the best cross-entropy value with label smoothing
        loss_normalizer = -(
            (1. - args.label_smooth_eps) * np.log(1. - args.label_smooth_eps)
            + args.label_smooth_eps * np.log(
                args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20))
        ce_time = []
        ce_ppl = []
        avg_loss = []
        step_idx = 0
        for pass_id in range(args.epoch):
            pass_start_time = time.time()
            batch_id = 0
            for input_data in train_loader():
                (src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
                 trg_slf_attn_bias, trg_src_attn_bias, lbl_word,
                 lbl_weight) = input_data
                logits = transformer(src_word, src_pos, src_slf_attn_bias,
                                     trg_word, trg_pos, trg_slf_attn_bias,
                                     trg_src_attn_bias)
                sum_cost, avg_cost, token_num = criterion(logits, lbl_word,
                                                          lbl_weight)
                avg_cost.backward()
                optimizer.minimize(avg_cost)
                transformer.clear_gradients()
                if step_idx % args.print_step == 0:
                    total_avg_cost = avg_cost.numpy() * trainer_count
                    avg_loss.append(total_avg_cost[0])
                    if step_idx == 0:
                        logging.info(
                            "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                            "normalized loss: %f, ppl: %f" %
                            (step_idx, pass_id, batch_id, total_avg_cost,
                             total_avg_cost - loss_normalizer,
                             np.exp([min(total_avg_cost, 100)])))
                        avg_batch_time = time.time()
                    else:
                        logging.info(
                            "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                            "normalized loss: %f, ppl: %f, speed: %.2f steps/s"
                            %
                            (step_idx, pass_id, batch_id, total_avg_cost,
                             total_avg_cost - loss_normalizer,
                             np.exp([min(total_avg_cost, 100)]),
                             args.print_step / (time.time() - avg_batch_time)))
                        ce_ppl.append(np.exp([min(total_avg_cost, 100)]))
                        avg_batch_time = time.time()
                batch_id += 1
                step_idx += 1
                if step_idx == STEP_NUM:
                    if args.save_dygraph_model_path:
                        model_dir = args.save_dygraph_model_path
                        if not os.path.exists(model_dir):
                            os.makedirs(model_dir)
                        fluid.save_dygraph(
                            transformer.state_dict(),
                            os.path.join(model_dir, "transformer"))
                        fluid.save_dygraph(
                            optimizer.state_dict(),
                            os.path.join(model_dir, "transformer"))
                    break
            time_consumed = time.time() - pass_start_time
            ce_time.append(time_consumed)
        return np.array(avg_loss)
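
Note the imperative update pattern in this dygraph version: avg_cost.backward() computes the gradients, optimizer.minimize(avg_cost) applies them, and transformer.clear_gradients() resets them before the next step. In the static version (Example #1), optimizer.minimize is instead declared once while the program is built, and each exe.run call executes the whole graph.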