def do_predict(args):
    paddle.enable_static()
    place = "gpu"
    place = paddle.set_device(place)

    reader.adapt_vocab_size(args)

    test_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    with paddle.static.program_guard(test_program, startup_program):
        src_word = paddle.static.data(
            name="src_word", shape=[None, None], dtype="int64")

        # Define model
        transformer = FasterTransformer(
            src_vocab_size=args.src_vocab_size,
            trg_vocab_size=args.trg_vocab_size,
            max_length=args.max_length + 1,
            num_encoder_layers=args.n_layer,
            num_decoder_layers=args.n_layer,
            n_head=args.n_head,
            d_model=args.d_model,
            d_inner_hid=args.d_inner_hid,
            dropout=args.dropout,
            weight_sharing=args.weight_sharing,
            bos_id=args.bos_idx,
            eos_id=args.eos_idx,
            decoding_strategy=args.decoding_strategy,
            beam_size=args.beam_size,
            max_out_len=args.max_out_len,
            decoding_lib=args.decoding_lib,
            use_fp16_decoding=args.use_fp16_decoding,
            rel_len=args.use_rel_len,
            alpha=args.alpha)

        finished_seq = transformer(src_word=src_word)

    test_program = test_program.clone(for_test=True)

    exe = paddle.static.Executor(place)
    exe.run(startup_program)

    # Load checkpoint.
    transformer.export_params(
        init_from_params=os.path.join(args.init_from_params,
                                      "transformer.pdparams"),
        place=place)

    paddle.static.save_inference_model(
        os.path.join(args.inference_model_dir, "transformer"),
        feed_vars=src_word,
        fetch_vars=finished_seq,
        executor=exe,
        program=test_program)

def do_predict(args): place = "gpu" place = paddle.set_device(place) # Define model transformer = FasterTransformer( src_vocab_size=args.src_vocab_size, trg_vocab_size=args.trg_vocab_size, max_length=args.max_length + 1, num_encoder_layers=args.n_layer, num_decoder_layers=args.n_layer, n_head=args.n_head, d_model=args.d_model, d_inner_hid=args.d_inner_hid, dropout=args.dropout, weight_sharing=args.weight_sharing, bos_id=args.bos_idx, eos_id=args.eos_idx, decoding_strategy=args.decoding_strategy, beam_size=args.beam_size, topk=args.topk, topp=args.topp, max_out_len=args.max_out_len, decoding_lib=args.decoding_lib, use_fp16_decoding=args.use_fp16_decoding, enable_faster_encoder=args.enable_faster_encoder, use_fp16_encoder=args.use_fp16_encoder) # Set evaluate mode transformer.eval() if args.enable_faster_encoder: transformer = enable_faster_encoder(transformer, use_fp16=args.use_fp16_encoder) src_word = generate_src_word(batch_size=args.infer_batch_size, vocab_size=args.src_vocab_size, max_length=args.max_length, eos_idx=args.eos_idx, pad_idx=args.bos_idx) with paddle.no_grad(): for i in range(100): # For warmup. if 50 == i: paddle.device.cuda.synchronize(place) start = time.time() transformer(src_word=src_word) paddle.device.cuda.synchronize(place) logger.info("Average test time for encoder-decoding is %f ms" % ((time.time() - start) / 50 * 1000))
def do_predict(args): place = "gpu" place = paddle.set_device(place) reader.adapt_vocab_size(args) # Define model transformer = FasterTransformer( src_vocab_size=args.src_vocab_size, trg_vocab_size=args.trg_vocab_size, max_length=args.max_length + 1, num_encoder_layers=args.n_layer, num_decoder_layers=args.n_layer, n_head=args.n_head, d_model=args.d_model, d_inner_hid=args.d_inner_hid, dropout=args.dropout, weight_sharing=args.weight_sharing, bos_id=args.bos_idx, eos_id=args.eos_idx, decoding_strategy=args.decoding_strategy, beam_size=args.beam_size, max_out_len=args.max_out_len, decoding_lib=args.decoding_lib, use_fp16_decoding=args.use_fp16_decoding, enable_faster_encoder=args.enable_faster_encoder, use_fp16_encoder=args.use_fp16_encoder, rel_len=args.use_rel_len, alpha=args.alpha) # Set evaluate mode transformer.eval() # Load checkpoint. transformer.load(init_from_params=os.path.join(args.init_from_params, "transformer.pdparams")) # Convert dygraph model to static graph model transformer = paddle.jit.to_static( transformer, input_spec=[ # src_word paddle.static.InputSpec(shape=[None, None], dtype="int64"), # trg_word # Support exporting model which support force decoding # NOTE: Data type MUST be int32 ! # paddle.static.InputSpec( # shape=[None, None], dtype="int32") ]) # Save converted static graph model paddle.jit.save(transformer, os.path.join(args.inference_model_dir, "transformer")) logger.info("Transformer has been saved to {}".format( args.inference_model_dir))
def do_predict(args): place = "gpu" paddle.set_device(place) # Define data loader test_loader, to_tokens = reader.create_infer_loader(args) # Define model transformer = FasterTransformer( src_vocab_size=args.src_vocab_size, trg_vocab_size=args.trg_vocab_size, max_length=args.max_length + 1, n_layer=args.n_layer, n_head=args.n_head, d_model=args.d_model, d_inner_hid=args.d_inner_hid, dropout=args.dropout, weight_sharing=args.weight_sharing, bos_id=args.bos_idx, eos_id=args.eos_idx, decoding_strategy="beam_search", beam_size=args.beam_size, max_out_len=args.max_out_len, decoding_lib=args.decoding_lib, use_fp16_decoding=args.use_fp16_decoding) # Set evaluate mode transformer.eval() # Load checkpoint. transformer.load(init_from_params=os.path.join(args.init_from_params, "transformer.pdparams")) f = open(args.output_file, "w") with paddle.no_grad(): for (src_word, ) in test_loader: finished_seq = transformer(src_word=src_word) finished_seq = finished_seq.numpy().transpose([1, 2, 0]) for ins in finished_seq: for beam_idx, beam in enumerate(ins): if beam_idx >= args.n_best: break id_list = post_process_seq(beam, args.bos_idx, args.eos_idx) word_list = to_tokens(id_list) sequence = " ".join(word_list) + "\n" f.write(sequence)
def do_predict(args): place = "gpu" place = paddle.set_device(place) # Define model transformer = FasterTransformer(src_vocab_size=args.src_vocab_size, trg_vocab_size=args.trg_vocab_size, max_length=args.max_length + 1, num_encoder_layers=args.n_layer, num_decoder_layers=args.n_layer, n_head=args.n_head, d_model=args.d_model, d_inner_hid=args.d_inner_hid, dropout=args.dropout, weight_sharing=args.weight_sharing, bos_id=args.bos_idx, eos_id=args.eos_idx, decoding_strategy=args.decoding_strategy, beam_size=args.beam_size, topk=args.topk, topp=args.topp, max_out_len=args.max_out_len, decoding_lib=args.decoding_lib, use_fp16_decoding=args.use_fp16_decoding) # Set evaluate mode transformer.eval() enc_output = paddle.randn( [args.infer_batch_size, args.max_length, args.d_model]) if args.use_fp16_decoding: enc_output = paddle.cast(enc_output, "float16") mem_seq_len = paddle.randint(1, args.max_length + 1, shape=[args.infer_batch_size], dtype="int32") with paddle.no_grad(): for i in range(100): # For warmup. if 50 == i: start = time.time() transformer.decoding(enc_output=enc_output, memory_seq_lens=mem_seq_len) logger.info("Average test time for decoding is %f ms" % ((time.time() - start) / 50 * 1000))
def do_predict(args): place = "gpu" place = paddle.set_device(place) # Define data loader # NOTE: Data yielded by DataLoader may be on CUDAPinnedPlace, # but custom op doesn't support CUDAPinnedPlace. Hence, # disable using CUDAPinnedPlace in DataLoader. paddle.fluid.reader.use_pinned_memory(False) test_loader, to_tokens = reader.create_infer_loader(args) # Define model transformer = FasterTransformer( src_vocab_size=args.src_vocab_size, trg_vocab_size=args.trg_vocab_size, max_length=args.max_length + 1, num_encoder_layers=args.n_layer, num_decoder_layers=args.n_layer, n_head=args.n_head, d_model=args.d_model, d_inner_hid=args.d_inner_hid, dropout=args.dropout, weight_sharing=args.weight_sharing, bos_id=args.bos_idx, eos_id=args.eos_idx, decoding_strategy=args.decoding_strategy, beam_size=args.beam_size, max_out_len=args.max_out_len, diversity_rate=args.diversity_rate, decoding_lib=args.decoding_lib, use_fp16_decoding=args.use_fp16_decoding, enable_faster_encoder=args.enable_faster_encoder, use_fp16_encoder=args.use_fp16_encoder) # Set evaluate mode transformer.eval() # Load checkpoint. transformer.load(init_from_params=os.path.join(args.init_from_params, "transformer.pdparams")) f = open(args.output_file, "w") with paddle.no_grad(): if args.profile: import time start = time.time() for (src_word, ) in test_loader: finished_seq = transformer(src_word=src_word) if not args.profile: if args.decoding_strategy == "beam_search" or args.decoding_strategy == "beam_search_v2": finished_seq = finished_seq.numpy().transpose([1, 2, 0]) elif args.decoding_strategy == "topk_sampling" or args.decoding_strategy == "topp_sampling": finished_seq = np.expand_dims( finished_seq.numpy().transpose([1, 0]), axis=1) for ins in finished_seq: for beam_idx, beam in enumerate(ins): if beam_idx >= args.n_best: break id_list = post_process_seq(beam, args.bos_idx, args.eos_idx) word_list = to_tokens(id_list) sequence = " ".join(word_list) + "\n" f.write(sequence) if args.profile: if args.decoding_strategy == "beam_search" or args.decoding_strategy == "beam_search_v2": logger.info( "Setting info: batch size: {}, beam size: {}, use fp16: {}. ". format(args.infer_batch_size, args.beam_size, args.use_fp16_decoding)) elif args.decoding_strategy == "topk_sampling": logger.info( "Setting info: batch size: {}, topk: {}, use fp16: {}. ". format(args.infer_batch_size, args.topk, args.use_fp16_decoding)) elif args.decoding_strategy == "topp_sampling": logger.info( "Setting info: batch size: {}, topp: {}, use fp16: {}. ". format(args.infer_batch_size, args.topp, args.use_fp16_decoding)) paddle.fluid.core._cuda_synchronize(place) logger.info("Average time latency is {} ms/batch. ".format(( time.time() - start) / len(test_loader) * 1000))