default=os.path.join('~', 'cased_L-12_H-768_A-12/'), help='Path to Tensorflow checkpoint folder. ' 'Default is /home/ubuntu/cased_L-12_H-768_A-12/') parser.add_argument('--out_dir', type=str, default=os.path.join('~', 'output'), help='Path to output folder. The folder must exist. ' 'Default is /home/ubuntu/output/') parser.add_argument('--debug', action='store_true', help='debugging mode') args = parser.parse_args() logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO) logging.info(args) # convert vocabulary vocab_path = os.path.join(args.tf_checkpoint_dir, 'vocab.txt') vocab, reserved_token_idx_map = convert_vocab(vocab_path) # vocab serialization tmp_file_path = os.path.expanduser(os.path.join(args.out_dir, 'tmp')) with open(tmp_file_path, 'w') as f: f.write(vocab.to_json()) hash_full, hash_short = get_hash(tmp_file_path) gluon_vocab_path = os.path.expanduser( os.path.join(args.out_dir, hash_short + '.vocab')) with open(gluon_vocab_path, 'w') as f: f.write(vocab.to_json()) logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path, hash_full) # load tf model tf_checkpoint_file = os.path.expanduser(
def test(args: Namespace):
    """Interactive translation REPL using the latest trained checkpoint.

    Rebuilds the encoder/decoder from the JSON config referenced by
    ``args.config_path``, restores the most recent checkpoint, then loops:
    read an English sentence from stdin, greedy-decode a German translation,
    and print it. An empty input line exits the loop.

    Args:
        args: parsed CLI namespace; only ``args.config_path`` is read.
    """
    # Use a context manager so the config file handle is closed promptly
    # (the original `json.load(open(...))` leaked it).
    with open(args.config_path, 'r', encoding='UTF-8') as cfg_file:
        cfg = json.load(cfg_file)

    batch_size = 1  # predicting one sentence at a time.

    encoder = Encoder(cfg['vocab_input_size'], cfg['embedding_dim'],
                      cfg['units'], batch_size, 0)
    decoder = Decoder(cfg['vocab_target_size'], cfg['embedding_dim'],
                      cfg['units'], cfg['method'], batch_size, 0)
    optimizer = select_optimizer(cfg['optimizer'], cfg['learning_rate'])

    # Restore the newest checkpoint; if none exists, restore() is a no-op
    # and the model runs with fresh weights.
    ckpt = tf.train.Checkpoint(optimizer=optimizer,
                               encoder=encoder,
                               decoder=decoder)
    manager = tf.train.CheckpointManager(ckpt, cfg['checkpoint_dir'],
                                         max_to_keep=3)
    ckpt.restore(manager.latest_checkpoint)

    # Vocabularies and tokenizers are loop-invariant: build them ONCE here
    # instead of re-reading the vocab files on every input sentence
    # (the original rebuilt them inside the while loop).
    input_vocab = load_vocab('./data/', 'en')
    target_vocab = load_vocab('./data/', 'de')

    input_lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters='', oov_token='<unk>')
    input_lang_tokenizer.word_index = input_vocab

    target_lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        filters='', oov_token='<unk>')
    target_lang_tokenizer.word_index = target_vocab

    convert_vocab(input_lang_tokenizer, input_vocab)
    convert_vocab(target_lang_tokenizer, target_vocab)

    while True:
        sentence = input(
            'Input Sentence or If you want to quit, type Enter Key : ')

        if sentence == '':
            break

        # Pad punctuation with spaces so it tokenizes as separate words.
        sentence = re.sub(r"(\.\.\.|[?.!,¿])", r" \1 ", sentence)
        # NOTE(review): the class [" "] matches double quotes AND spaces,
        # so quotes are also collapsed to a single space — kept as-is to
        # preserve behavior; confirm whether r' +' was intended.
        sentence = re.sub(r'[" "]+', " ", sentence)
        sentence = '<s> ' + sentence.lower().strip() + ' </s>'

        # Map tokens to ids, falling back to <unk> for OOV words.
        inputs = [
            input_lang_tokenizer.word_index[i]
            if i in input_lang_tokenizer.word_index else
            input_lang_tokenizer.word_index['<unk>']
            for i in sentence.split(' ')
        ]
        inputs = tf.keras.preprocessing.sequence.pad_sequences(
            [inputs], maxlen=cfg['max_len_input'], padding='post')
        inputs = tf.convert_to_tensor(inputs)

        result = ''

        # Fresh zero state per sentence: four [hidden, cell] pairs —
        # presumably one per stacked LSTM layer (TODO confirm against Encoder).
        enc_hidden = encoder.initialize_hidden_state()
        enc_cell = encoder.initialize_cell_state()
        enc_state = [[enc_hidden, enc_cell],
                     [enc_hidden, enc_cell],
                     [enc_hidden, enc_cell],
                     [enc_hidden, enc_cell]]

        enc_output, enc_hidden = encoder(inputs, enc_state)

        dec_hidden = enc_hidden
        # Greedy decoding starts from the <s> (start-of-sentence) token.
        dec_input = tf.expand_dims([target_lang_tokenizer.word_index['<s>']], 1)
        print('dec_input:', dec_input)

        # Attentional hidden state fed back into the decoder each step.
        h_t = tf.zeros((batch_size, 1, cfg['embedding_dim']))

        for t in range(int(cfg['max_len_target'])):
            predictions, dec_hidden, h_t = decoder(dec_input, dec_hidden,
                                                   enc_output, h_t)
            # predictions shape == (1, vocab_target_size); take the argmax
            # over the vocabulary (greedy search).
            predicted_id = tf.argmax(predictions[0]).numpy()
            print('predicted_id', predicted_id)

            result += target_lang_tokenizer.index_word[predicted_id] + ' '

            if target_lang_tokenizer.index_word[predicted_id] == '</s>':
                print('Early stopping')
                break

            # Feed the predicted token back in as the next decoder input.
            dec_input = tf.expand_dims([predicted_id], 1)
            print('dec_input:', dec_input)

        print('<s> ' + result)
        print(sentence)
        sys.stdout.flush()