def predict_word(text: str, model: "BertForMaskedLM", tokenizer: "BertTokenizer", tgt_word: str, tgt_pos: int):
    """Score *tgt_word* at position *tgt_pos* of a template and fill all blanks.

    Every '_' token in *text* is replaced with BERT's [MASK] token, the model
    predicts a distribution over the vocabulary for each position, and the
    probability of *tgt_word* at *tgt_pos* is returned together with the
    template sentence with each blank filled by its argmax prediction.

    Args:
        text: Template sentence whose blanks are single '_' tokens.
        model: Masked-LM model; called as ``model(tensor)`` and expected to
            return raw logits of shape (1, seq_len, vocab_size)
            (pytorch_pretrained_bert-style API).
        tokenizer: BERT tokenizer providing ``tokenize``,
            ``convert_tokens_to_ids``, ``convert_ids_to_tokens`` and ``vocab``.
        tgt_word: Vocabulary word whose probability is queried.
        tgt_pos: Token index (into the tokenized template) to query.

    Returns:
        Tuple ``(out_prob, pred_sent)``: probability of *tgt_word* at
        *tgt_pos*, and the filled-in sentence with predictions wrapped in
        underscores for inspection.
    """
    # Replace each '_' placeholder with [MASK], remembering where they were.
    tokenized_text = tokenizer.tokenize(text)
    mask_positions = [i for i, tok in enumerate(tokenized_text) if tok == '_']
    for i in mask_positions:
        tokenized_text[i] = '[MASK]'

    # Convert tokens to vocab indices and batch into a (1, seq_len) tensor.
    token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([token_ids])

    # Run BERT in eval mode. FIX: wrap in no_grad() — the original built an
    # autograd graph for a pure inference call, wasting memory and time.
    model.eval()
    with torch.no_grad():
        predictions = model(tokens_tensor)

    # Normalize the per-position logits into probabilities over the vocab.
    predictions = F.softmax(predictions, dim=2)

    # Probability of the target word at the target position.
    out_prob = predictions[0, tgt_pos, tokenizer.vocab[tgt_word]].item()

    # Fill every blank with its highest-probability token, for inspection.
    for mask_pos in mask_positions:
        predicted_index = torch.argmax(predictions[0, mask_pos, :]).item()
        tokenized_text[mask_pos] = tokenizer.convert_ids_to_tokens([predicted_index])[0]

    # Mark the filled positions and undo wordpiece '##' splits for display.
    for mask_pos in mask_positions:
        tokenized_text[mask_pos] = "_" + tokenized_text[mask_pos] + "_"
    pred_sent = ' '.join(tokenized_text).replace(' ##', '')
    return out_prob, pred_sent
continue if tokens[0] != CLS: tokens = [CLS] + tokens if tokens[-1] != SEP: tokens.append(SEP) token_idx, segment_idx, mask = to_bert_input(tokens, bert_tokenizer) with torch.no_grad(): logits = bert_model(token_idx, segment_idx, mask, masked_lm_labels=None) logits = logits.squeeze(0) probs = torch.softmax(logits, dim=-1) for idx, token in enumerate(tokens): if token == MASK: mask_cnt += 1 print('Top {} predictions for {}th {}:'.format( args.topk, mask_cnt, MASK)) topk_prob, topk_indices = torch.topk(probs[idx, :], args.topk) topk_tokens = bert_tokenizer.convert_ids_to_tokens( topk_indices.cpu().numpy()) for prob, tok in zip(topk_prob, topk_tokens): print('{} {}'.format(tok, prob)) predict_res.append(tok) print('='*80) cnt = correct_cnt = 0 for item1, item2 in zip(res, predict_res): if item1 == item2: correct_cnt = correct_cnt+1 print(item1+'对了!') cnt = cnt+1 print('correct rate is:%.2f' % (correct_cnt/cnt))
class BertWithJumanModel():
    """Paraphrase Japanese text by iteratively re-predicting each token with a BERT masked LM."""

    def __init__(self, bert_path, vocab_file_name="vocab.txt", use_cuda=False):
        # Juman segments raw Japanese text before BERT wordpiece tokenization.
        self.juman_tokenizer = JumanTokenizer()
        # Pretrained BERT model with the masked-LM head.
        self.model = BertForMaskedLM.from_pretrained(bert_path)
        # Tokenizer over the pretrained vocab; input is already pre-segmented
        # by Juman and the Japanese vocab is case-sensitive, so both lowercasing
        # and basic tokenization are disabled.
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        # Whether to run the model on a CUDA GPU.
        self.use_cuda = use_cuda

    def _preprocess_text(self, text):
        # Strip half-width spaces: Juman cannot handle them.
        return text.replace(" ", "")  # for Juman

    def paraphrase(self, text):
        """Greedily resample every token of *text* 10 times over, printing top-5 candidates.

        Each pass masks one position at a time, runs the masked LM, and writes
        the argmax prediction back into the sequence; the final sequence is
        logged as the paraphrase.
        """
        # Remove half-width spaces, segment with Juman, then wordpiece-tokenize.
        preprocessed_text = self._preprocess_text(text)
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # Max sequence length is 128: [CLS] + up to 126 tokens + [SEP].
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        generated_token_ids = torch.tensor(ids).reshape(1, -1)
        if self.use_cuda:
            # Move both input and model to the GPU.
            generated_token_ids = generated_token_ids.to('cuda')
            self.model.to('cuda')
        # Inference only: eval mode, no autograd graph.
        self.model.eval()
        with torch.no_grad():
            for i in range(10):
                # FIX: iterate over the interior of the actual id sequence
                # (excluding [CLS]/[SEP]). The original iterated over the Juman
                # token list, whose length can differ from the wordpiece count
                # and exceed the 126-token truncation, producing wrong mask
                # positions or an out-of-range index.
                for masked_index in range(1, generated_token_ids.shape[1] - 1):
                    # Mask one position and predict a replacement for it.
                    generated_token_ids[0, masked_index] = \
                        self.bert_tokenizer.vocab["[MASK]"]
                    outputs = self.model(generated_token_ids)
                    predictions = outputs[0]
                    _, predicted_indexes = torch.topk(
                        predictions[0, masked_index], k=5)
                    predicted_tokens = self.bert_tokenizer.convert_ids_to_tokens(
                        predicted_indexes.tolist())
                    print(predicted_tokens)
                    # Write the top-1 prediction back into the sequence.
                    predict_token = predicted_indexes.tolist()[0]
                    generated_token_ids[0, masked_index] = predict_token
        # Convert ids back to a string, dropping padding and '##' wordpiece markers.
        sampled_sequence = [
            self.bert_tokenizer.ids_to_tokens[token_id]
            for token_id in generated_token_ids[0].cpu().numpy()
        ]
        sampled_sequence = "".join([
            token[2:] if token.startswith("##") else token
            for token in list(
                filter(lambda x: x != '[PAD]', sampled_sequence))
        ])
        logger.info(
            "sampled sequence: {}".format(sampled_sequence))
class Generater:
    """Genetic-algorithm-style short-text generator scored by a BERT masked LM.

    Candidate token sequences are initialized randomly, scored by model
    likelihood plus external tanka (poem) heuristics, then evolved via
    selection, crossover and BERT-guided mutation.
    """

    def __init__(self, bert_path):
        vocab_file_name = 'vocab.txt'
        # Juman segments raw Japanese text before BERT wordpiece tokenization.
        self.juman_tokenizer = JumanTokenizer()
        # Tokenizer over the pretrained vocab; input is pre-segmented by Juman.
        self.bert_tokenizer = BertTokenizer(Path(bert_path) / vocab_file_name,
                                            do_lower_case=False,
                                            do_basic_tokenize=False)
        self.vocab_size = len(self.bert_tokenizer.vocab)
        # Pretrained BERT model with the masked-LM head.
        # FIX: the original loaded BertModel.from_pretrained first and then
        # immediately overwrote self.model with BertForMaskedLM — the first
        # (expensive) load was dead weight and has been removed.
        self.model = BertForMaskedLM.from_pretrained(bert_path)
        # Special and punctuation tokens excluded from generation.
        except_tokens = ["[MASK]",
                         "[UNK]", "[CLS]", "[SEP]",
                         "(", ")", "・", "/", "、", "。", "!", "?", "「", "」",
                         "…", "’", "』", "『", ":", "※"
                         ]
        self.except_ids = [self.bert_tokenizer.vocab[token] for token in except_tokens]
        # All vocab ids except the excluded ones are usable as candidates.
        self.candidate_ids = [i for i in range(self.vocab_size)
                              if i not in self.except_ids]

    def _preprocess_text(self, text):
        # Strip half-width spaces (Juman cannot handle them) and '#' characters.
        return text.replace(" ", "").replace('#', '')  # for Juman

    def text2tokens(self, text):
        """Tokenize *text* into a (1, seq_len) tensor of BERT vocab ids."""
        preprocessed_text = self._preprocess_text(text)
        # Juman word segmentation, then BERT wordpiece tokenization.
        tokens = self.juman_tokenizer.tokenize(preprocessed_text)
        bert_tokens = self.bert_tokenizer.tokenize(" ".join(tokens))
        # Max sequence length is 128: [CLS] + up to 126 tokens + [SEP].
        ids = self.bert_tokenizer.convert_tokens_to_ids(
            ["[CLS]"] + bert_tokens[:126] + ["[SEP]"])  # max_seq_len-2
        generated_token_ids = torch.tensor(ids).reshape(1, -1)
        return generated_token_ids

    def tokens2text(self, tokens):
        """Convert a (1, seq_len) id tensor back to a plain string."""
        sampled_sequence = [self.bert_tokenizer.ids_to_tokens[token_id]
                            for token_id in tokens[0].cpu().numpy()]
        # Drop special tokens and undo '##' wordpiece splits.
        sampled_sequence = "".join(
            [
                token[2:] if token.startswith("##") else token
                for token in list(filter(
                    lambda x: x != '[PAD]' and x != '[CLS]' and x != '[SEP]',
                    sampled_sequence))
            ]
        )
        return sampled_sequence

    def likelihood(self, tokens):
        """Sum the model's raw logit for each token at its own position.

        NOTE(review): these are unnormalized logits, not log-probabilities —
        adequate as a relative fitness score, not a true likelihood.
        """
        outputs = self.model(tokens)
        predictions = outputs[0]
        score_sum = 0.0
        for idx, scores in zip(tokens[0].tolist(),
                               predictions[0].tolist()):
            score_sum += scores[idx]
        return score_sum

    def initialization_text(self, length=10):
        """Build a random candidate: [CLS] + *length* random tokens + [SEP]."""
        init_tokens = []
        init_tokens.append(self.bert_tokenizer.vocab["[CLS]"])
        for _ in range(length):
            # Pick uniformly from the allowed (non-special) vocabulary.
            init_tokens.append(random.choice(self.candidate_ids))
        init_tokens.append(self.bert_tokenizer.vocab["[SEP]"])
        return torch.tensor(init_tokens).reshape(1, -1)

    def scoring(self, tokens):
        """Fitness = model likelihood + external tanka structure/flow heuristics."""
        return (self.likelihood(tokens)
                + self.juman_tokenizer.tanka_score_subsets(self.tokens2text(tokens))
                + self.juman_tokenizer.tanka_score_flow(self.tokens2text(tokens)))

    def select(self, l_tokens, size=5):
        """Return candidates sorted by descending fitness score.

        NOTE(review): *size* is currently unused — the full sorted list is
        returned, not the top *size*; confirm intent before truncating.
        """
        scores = list(map(self.scoring, l_tokens))
        print(sorted(scores, reverse=True)[:3])
        selected = list(map(
            lambda x: x[0],
            sorted(
                list(zip(l_tokens, scores)), key=lambda x: x[1], reverse=True
            )
        ))
        return selected

    def crossover(self, tokens_0, tokens_1):
        """Copy a random interior span of *tokens_1* into *tokens_0* (new tensor)."""
        l_tokens_0 = tokens_0.numpy().reshape(-1).tolist()
        l_tokens_1 = tokens_1.numpy().reshape(-1).tolist()
        # Span bounds exclude [CLS] (index 0) and [SEP] (last index).
        start = random.randint(1, len(l_tokens_0) - 3)
        end = random.randint(start, len(l_tokens_0) - 2)
        for num in range(start, end):
            l_tokens_0[num] = l_tokens_1[num]
        return torch.tensor(l_tokens_0).reshape(1, -1)

    def mutation(self, tokens, N=3):
        """Mask N random interior positions one at a time and refill each with
        one of BERT's top-10 predictions (excluded tokens filtered out)."""
        l_tokens = tokens.numpy().reshape(-1).tolist()
        for _ in range(N):
            # Pick a random interior position, excluding [CLS] and [SEP].
            num = random.randint(1, len(l_tokens) - 2)
            l_tokens[num] = self.bert_tokenizer.vocab["[MASK]"]
            outputs = self.model(torch.tensor(l_tokens).reshape(1, -1))
            predictions = outputs[0]
            _, predicted_indexes = torch.topk(predictions[0, num], k=10)
            # Filter out excluded special/punctuation ids from the candidates.
            predicted_indexes = list(
                set(predicted_indexes.tolist()) - set(self.except_ids)
            )
            # Choose one of the surviving candidates at random.
            predict_token = random.choice(predicted_indexes)
            l_tokens[num] = predict_token
        return torch.tensor(l_tokens).reshape(1, -1)