def test_model(sentence):
    """Generate a Chinese couplet completion for *sentence*.

    Feeds the (character-spaced) first line to the autoregressive model,
    greedily decodes up to 100 tokens, and returns a display string of the
    form ``"上联:<first line>\n\n下联:<second line>"``.

    Args:
        sentence: The first line of the couplet, either as a raw string or
            already space-separated into individual characters.

    Returns:
        A formatted string containing both couplet lines.

    Requires module-level ``model`` (a TF model), ``tf``, ``BertTokenizer``,
    and a ``vocab.txt`` file in the working directory.
    """
    # The tokenizer expects one character per whitespace-separated token;
    # insert spaces only if the caller has not done so already.
    # (Replaces the old `re.sub("", " ", sentence)[1:]` hack, which also
    # left a stray trailing space.)
    if " " not in sentence:
        sentence = " ".join(sentence)
    tokenizer = BertTokenizer(vocab_file="vocab.txt")
    input_data = tokenizer([sentence], return_tensors="tf",
                           add_special_tokens=False)
    # Prepend [CLS] manually since special tokens were disabled above.
    input_ids = list(input_data["input_ids"][0].numpy())
    input_ids.insert(0, tokenizer.get_vocab()["[CLS]"])
    input_ids = tf.constant(input_ids)[None, :]  # add batch dimension

    # Greedy autoregressive decoding, capped at 100 generated tokens.
    for _ in range(100):
        predictions = model(input_ids=input_ids, training=False)[0]
        predictions = predictions[:, -1:, :]  # logits for the last position only
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        # Token id 3 is assumed to be the end-of-sequence marker —
        # TODO(review): confirm against vocab.txt ([SEP] is usually id 3
        # in Chinese BERT vocabularies).
        if tf.equal(predicted_id[0].numpy(), [3]):
            break
        input_ids = tf.concat([input_ids, predicted_id], axis=-1)

    # Decoded text looks like "[CLS]<up line>|<down line>"; '|' separates
    # the two couplet lines — presumably a vocabulary convention of this
    # model's training data.
    result = "".join(tokenizer.batch_decode(tf.squeeze(input_ids, axis=0)))
    result = result.split("|")
    up_sentence = result[0]
    up_sentence = up_sentence.split("]")[1]  # strip the leading "[CLS]" marker
    un_sentence = result[1]
    result = "上联:" + up_sentence + "\n\n" + "下联:" + un_sentence
    return result
# NOTE(review): `s`, `seq`, and `f` are defined earlier in the file (not
# visible in this chunk); this union presumably runs once per sequence while
# accumulating the token vocabulary — verify against the preceding loop.
s = s | set(seq)
# Write the collected vocabulary, one token per line, to the open handle `f`.
for e in s:
    f.write(str(e) + '\n')
# %%
# Load training users, sorted by user_id so the label arrays align with
# feature rows built elsewhere in the pipeline.
user_train = pd.read_csv('data/train_preliminary/user.csv').sort_values(
    ['user_id'], ascending=(True, ))
Y_gender = user_train['gender'].values
Y_age = user_train['age'].values
# Shift labels from the CSV's 1-based coding to 0-based class indices.
Y_gender = Y_gender - 1
Y_age = Y_age - 1
# Y_age = to_categorical(Y_age)
# %%
# Smoke-test a BERT tokenizer built over the custom numeric-token vocabulary.
tokenizer = BertTokenizer('tmp/tmp.txt')
print(tokenizer.get_vocab())
sample_txt = '456 1 23 456 89 89'
tokenizer.tokenize(sample_txt)
# %%
# Encode one sample to inspect the produced tensors/keys.
encoding = tokenizer.encode_plus(
    sample_txt,
    max_length=32,
    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    return_token_type_ids=False,
    # NOTE(review): `pad_to_max_length` is deprecated in recent
    # transformers releases; `padding='max_length'` is the modern
    # equivalent — confirm the installed version before changing.
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',  # Return PyTorch tensors
)
encoding.keys()