def predict(input_text, net_trained, candidate_num=3, output_print=False):
    TEXT = pickle_load(PKL_FILE)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tokenizer_bert = BertTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)

    # Preprocess and tokenize the input, then add the BERT special tokens
    text = preprocessing_text(input_text)
    text = tokenizer_bert.tokenize(text)
    text.insert(0, "[CLS]")
    text.append("[SEP]")

    # Convert tokens to IDs and pad up to max_length (1 is the [PAD] id)
    token_ids = torch.ones((max_length)).to(torch.int64)
    ids_list = list(map(lambda x: TEXT.vocab.stoi[x], text))
    for i, index in enumerate(ids_list):
        token_ids[i] = index
    ids_list = token_ids.unsqueeze_(0)

    input = ids_list.to(device)
    input_mask = (input != 1)  # computed but not passed to the model below
    outputs, attention_probs = net_trained(input, token_type_ids=None,
                                           attention_mask=None,
                                           output_all_encoded_layers=False,
                                           attention_show_flg=True)

    # Subtract the class offset before ranking the candidate classes
    offset_tensor = torch.tensor(offset, device=device)
    outputs -= offset_tensor
    if output_print:
        print(outputs)
    _, preds = torch.topk(outputs, candidate_num)
    return preds
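# A minimal usage sketch (added for illustration, not in the original source). It assumes
# the module-level globals PKL_FILE, VOCAB_FILE, max_length and offset are defined as in
# the surrounding code, and that `net` is a fine-tuned model loaded elsewhere. The label
# mapping follows mk_html() below (0 = Negative, otherwise Positive).
if __name__ == "__main__":
    preds = predict("業績は前年同期比で大幅に増加した。", net, candidate_num=2)
    labels = {0: "Negative", 1: "Positive"}
    print([labels.get(int(i), int(i)) for i in preds[0]])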
def mk_html(input, preds, normlized_weights, TEXT):
    "Create the HTML visualization"
    tokenizer_bert = BertTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)

    # Extract the result at this index (index = position within the mini-batch)
    index = 0
    sentence = input.squeeze_(0)  # sentence: torch.Size([1, 256]) -> torch.Size([256])
    pred = preds[0]  # prediction

    # Convert the predicted class to a label string
    if pred == 0:
        pred_str = "Negative"
    else:
        pred_str = "Positive"

    # Build the HTML for display
    html = 'Predicted label: {}<br><br>'.format(pred_str)

    # Visualize the self-attention weights. With 12 heads there are 12 attention maps.
    for i in range(12):
        # Extract and normalize the attention for this sample:
        # the i-th head's attention from the 0th token [CLS]
        attens = normlized_weights[index, i, 0, :]
        attens /= attens.max()

        # html += '[BERT attention visualization_' + str(i+1) + ']<br>'
        for word, attn in zip(sentence, attens):
            # Stop at [SEP], which marks the end of the sentence
            if tokenizer_bert.convert_ids_to_tokens([word.numpy().tolist()])[0] == "[SEP]":
                break
            # highlight() colors the word; convert_ids_to_tokens() turns the ID back into a token
            # html += highlight(tokenizer_bert.convert_ids_to_tokens(
            #     [word.numpy().tolist()])[0], attn)
        # html += "<br><br>"

    # Average the 12 attention maps and normalize by the maximum value
    all_attens = attens * 0  # accumulator for the averaged attention
    for i in range(12):
        all_attens += normlized_weights[index, i, 0, :]
    all_attens /= all_attens.max()

    html += '[BERT attention visualization: ALL]<br>'
    for word, attn in zip(sentence, all_attens):
        # Stop at [SEP], which marks the end of the sentence
        if tokenizer_bert.convert_ids_to_tokens([word.numpy().tolist()])[0] == "[SEP]":
            break
        # highlight() colors the word; convert_ids_to_tokens() turns the ID back into a token
        html += highlight(tokenizer_bert.convert_ids_to_tokens(
            [word.numpy().tolist()])[0], attn)
    html += "<br><br>"

    return html
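# A hedged usage sketch (added for illustration, not in the original). It assumes the
# padded id tensor `input`, the predictions `preds`, and the attention weights
# `attention_probs` come from a forward pass with attention_show_flg=True (as in
# predict() above), and that highlight(), VOCAB_FILE and TEXT are defined in this module.
from IPython.display import HTML, display

html_output = mk_html(input, preds, attention_probs, TEXT)
display(HTML(html_output))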
def __init__(self, data_dir=r'./',
             bert_dir=r'./pytorch_advanced/nlp_sentiment_bert/'):
    self.data_dir = data_dir
    self.bert_dir = bert_dir
    self.tokenizer_bert = BertTokenizer(
        vocab_file=self.bert_dir + "vocab/bert-base-uncased-vocab.txt",
        do_lower_case=True)
    self.vocab_bert, self.ids_to_tokens_bert = load_vocab(
        vocab_file=self.bert_dir + "vocab/bert-base-uncased-vocab.txt")
    config = get_config(file_path=self.bert_dir + "weights/bert_config.json")
    self.net_bert = BertModel(config)
    self.net_bert = set_learned_params(
        self.net_bert, weights_path=self.bert_dir + "weights/pytorch_model.bin")
def __init__(self, vocab_file, max_text_length=256,
             use_basic_form=False, mecab_dict=None):
    self.tokenizer = BertTokenizer(
        vocab_file=vocab_file, do_lower_case=False, do_basic_tokenize=False)
    if mecab_dict is not None:
        self.tagger = MeCab.Tagger("-d {}".format(mecab_dict))
    else:
        self.tagger = MeCab.Tagger("")
    self.text_field, self.label_field = self._prepare(
        max_text_length, use_basic_form)
    self.vocab, self.ids_to_tokens = self._load_vocab(vocab_file)
def __init__(self, vocab_file, max_text_length=256, **kwargs):
    do_normalize_text = kwargs.get("do_normalize_text", False)
    use_basic_form = kwargs.get("use_basic_form", False)
    mecab_dict = kwargs.get("mecab_dict", None)

    self.tokenizer = BertTokenizer(
        vocab_file=vocab_file, do_lower_case=False, do_basic_tokenize=False)
    if mecab_dict is not None:
        self.tagger = MeCab.Tagger("-d {}".format(mecab_dict))
    else:
        self.tagger = MeCab.Tagger("")
    self.text_field, self.label_field = self._prepare(
        max_text_length, do_normalize_text, use_basic_form)
    self.vocab, self.ids_to_tokens = self._load_vocab(vocab_file)
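# A hedged construction sketch (added for illustration). "JapaneseBertFieldBuilder" is a
# hypothetical stand-in for whichever class owns the __init__ above; the vocab path and
# MeCab dictionary path are assumptions, not taken from the original code.
builder = JapaneseBertFieldBuilder(
    vocab_file="./vocab/vocab.txt",
    max_text_length=256,
    do_normalize_text=True,
    mecab_dict="/usr/local/lib/mecab/dic/mecab-ipadic-neologd")
print(len(builder.vocab))  # size of the WordPiece vocabulary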
import os
import io
import string
import re
import sys
import random
import spacy
import torchtext
import mojimoji
# import MeCab

from torchtext.vocab import Vectors
from utils.bert import BertTokenizer, load_vocab

# Prepare the tokenizer used for word segmentation
tokenizer_bert = BertTokenizer(vocab_file="./vocab/vocab.txt", do_lower_case=False)


def get_chABSA_DataLoaders_and_TEXT(max_length=256, batch_size=32):
    """Build the chABSA DataLoaders and the TEXT object."""

    def preprocessing_text(text):
        # Normalize half-width characters to full-width
        text = mojimoji.han_to_zen(text)
        # Remove line breaks and half-/full-width spaces
        text = re.sub('\r', '', text)
        text = re.sub('\n', '', text)
        text = re.sub(' ', '', text)
        text = re.sub('　', '', text)
        # Replace every digit (half- or full-width) with "0"
        text = re.sub(r'[0-9０-９]+', '0', text)
# In[14]:


# Sanity check: inspect one mini-batch from the training DataLoader
batch = next(iter(train_dl))
print("Shape of Text =", batch.Text[0].shape)
print("Shape of Label =", batch.Label.shape)
print(batch.Text)
print(batch.Label)


# In[15]:


# Inspect one sentence of the mini-batch
tokenizer_bert = BertTokenizer(vocab_file="./vocab/vocab.txt", do_lower_case=False)

text_minibatch_1 = (batch.Text[0][1]).numpy()

# Convert the IDs back to tokens
text = tokenizer_bert.convert_ids_to_tokens(text_minibatch_1)
print(text)


# # 2. Implementing the BERT negative/positive classification model

# In[16]:


from utils.bert import get_config, BertModel, BertForchABSA, set_learned_params
def DataLoader(max_length=256, batch_size=32):
    """Build the DataLoaders and the TEXT object from the train/test CSV files."""

    # Set the random seeds
    torch.manual_seed(0)
    np.random.seed(0)
    random.seed(0)

    # Prepare the tokenizer used for word segmentation
    tokenizer_bert = BertTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)

    def preprocessing_text(text):
        # Normalize half-width characters to full-width
        text = mojimoji.han_to_zen(text)
        # Remove line breaks and half-/full-width spaces
        text = re.sub('\r', '', text)
        text = re.sub('\n', '', text)
        text = re.sub(' ', '', text)
        text = re.sub('　', '', text)
        text = re.sub("\"", '', text)
        # Replace every digit (half- or full-width) with "0"
        text = re.sub(r'[0-9０-９]+', '0', text)
        # Replace symbols other than commas and periods with spaces
        for p in string.punctuation:
            if (p == ".") or (p == ","):
                continue
            else:
                text = text.replace(p, " ")
        return text

    # Combine preprocessing and word segmentation in one function.
    # Note that tokenizer_bert.tokenize (the segmentation function) is passed,
    # not tokenizer_bert itself.
    def tokenizer_with_preprocessing(text, tokenizer=tokenizer_bert.tokenize):
        text = preprocessing_text(text)
        ret = tokenizer(text)  # tokenizer_bert
        return ret

    # Define how the loaded data is processed when it is read
    max_length = 256
    TEXT = torchtext.data.Field(sequential=True,
                                tokenize=tokenizer_with_preprocessing,
                                use_vocab=True,
                                lower=False,
                                include_lengths=True,
                                batch_first=True,
                                fix_length=max_length,
                                init_token="[CLS]",
                                eos_token="[SEP]",
                                pad_token='[PAD]',
                                unk_token='[UNK]')
    LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

    # Read the CSV files from the data folder
    # (BERT preprocessing is applied, so this takes a little under 10 minutes)
    train_val_ds, test_ds = torchtext.data.TabularDataset.splits(
        path=DATA_PATH, train='train.csv', test='test.csv',
        format='csv', fields=[('Text', TEXT), ('Label', LABEL)])

    vocab_bert, ids_to_tokens_bert = load_vocab(vocab_file=VOCAB_FILE)
    TEXT.build_vocab(train_val_ds, min_freq=1)
    TEXT.vocab.stoi = vocab_bert

    batch_size = 32  # BERT typically uses batch sizes around 16 or 32
    train_dl = torchtext.data.Iterator(train_val_ds, batch_size=batch_size, train=True)
    val_dl = torchtext.data.Iterator(test_ds, batch_size=batch_size,
                                     train=False, sort=False)

    # Collect the loaders in a dict
    dataloaders_dict = {"train": train_dl, "val": val_dl}

    return train_dl, val_dl, TEXT, dataloaders_dict
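# A hedged usage sketch (added; not in the original). It assumes DATA_PATH, VOCAB_FILE
# and the imports used above are defined in this module, and that train.csv / test.csv exist.
train_dl, val_dl, TEXT, dataloaders_dict = DataLoader(max_length=256, batch_size=32)
batch = next(iter(train_dl))
print(batch.Text[0].shape)   # torch.Size([32, 256]) when a full batch is available
print(batch.Label.shape)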
class tmv_torch_bert_classify(lreg.tmv_tf_log_regress_classify):
    def __init__(self, data_dir=r'./',
                 bert_dir=r'./pytorch_advanced/nlp_sentiment_bert/'):
        self.data_dir = data_dir
        self.bert_dir = bert_dir
        self.tokenizer_bert = BertTokenizer(
            vocab_file=self.bert_dir + "vocab/bert-base-uncased-vocab.txt",
            do_lower_case=True)
        self.vocab_bert, self.ids_to_tokens_bert = load_vocab(
            vocab_file=self.bert_dir + "vocab/bert-base-uncased-vocab.txt")
        config = get_config(file_path=self.bert_dir + "weights/bert_config.json")
        self.net_bert = BertModel(config)
        self.net_bert = set_learned_params(
            self.net_bert, weights_path=self.bert_dir + "weights/pytorch_model.bin")

    def load_data(self, csv_file_kspa, dependent_var, langs=None,
                  task_word='Definition', answer_ex_clm='Definition'):
        self.dependent_var = dependent_var
        self.answer_ex_clm = answer_ex_clm
        self.df_response_answer_ex = pd.read_csv(self.data_dir + csv_file_kspa,
                                                 encoding='latin1')
        self.df_response_answer_ex = self.df_response_answer_ex.set_index(
            r'Student_Question_Index')
        if langs != None:
            lang_clm = task_word + r'-Language'
            self.df_response_answer_ex = \
                self.df_response_answer_ex[self.df_response_answer_ex[lang_clm].isin(langs)]
        self.ans_clm = task_word + r'-Answer'
        self.ans_and_ex_clm = task_word + r'-Answer-and-Example'
        self.df_response_answer_ex[self.ans_and_ex_clm] = self.df_response_answer_ex[self.answer_ex_clm] \
            + ' ' + self.df_response_answer_ex[self.ans_clm]

        # to move LABEL and TXT columns to the end
        columns = list(self.df_response_answer_ex.columns)
        columns.remove(self.dependent_var)
        columns.remove(self.ans_and_ex_clm)
        columns.append(self.dependent_var)
        columns.append(self.ans_and_ex_clm)
        self.df_ac_modeling_values = self.df_response_answer_ex.reindex(
            columns=columns)

    def get_tokens(self):
        def preprocessing_text(text):
            for p in string.punctuation:
                if (p == ".") or (p == ","):
                    continue
                else:
                    text = text.replace(p, " ")
            text = text.replace(".", " . ")
            text = text.replace(",", " , ")
            return text

        def tokenizer_with_preprocessing(text, tokenizer=self.tokenizer_bert.tokenize):
            text = preprocessing_text(text)
            ret = tokenizer(text)  # tokenizer_bert
            return ret

        TEXT = torchtext.data.Field(sequential=True,
                                    tokenize=tokenizer_with_preprocessing,
                                    use_vocab=True,
                                    lower=True,
                                    include_lengths=True,
                                    batch_first=True,
                                    fix_length=max_length,
                                    init_token="[CLS]",
                                    eos_token="[SEP]",
                                    pad_token='[PAD]',
                                    unk_token='[UNK]')
        LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

        fields = [(None, None)] * (len(self.df_response_answer_ex.columns) - 1)
        fields.append(('Label', LABEL))
        fields.append(('Text', TEXT))

        train_val_ds = torchtext.data.TabularDataset(
            path=self.modeling_data_file_name, format='csv',
            fields=fields, skip_header=True)

        TEXT.build_vocab(train_val_ds, min_freq=1)
        TEXT.vocab.stoi = self.vocab_bert

        return train_val_ds

    # Modified by [email protected] 11/22/2020
    def perform_modeling(self, df_ac_modeling_data, key_word=r'', csv_dump=False,
                         number_class=3, epochs=10, batch_size=32,
                         tmp_csv_name='TORCH_RESPONSE_ANSWER_EX_FILE.CSV',
                         bert_pkl_name='weights/bert_fine_tuning_VDOK_'):
        self.modeling_data_file_name = self.data_dir + tmp_csv_name
        self.batch_size = batch_size

        df_ac_modeling_data_buf = df_ac_modeling_data.copy()
        if self.ans_and_ex_clm not in df_ac_modeling_data_buf.columns:
            df_ac_modeling_data_buf[self.ans_and_ex_clm] = df_ac_modeling_data_buf[self.answer_ex_clm] \
                + ' ' + df_ac_modeling_data_buf[self.ans_clm]

        # to move LABEL and TXT columns to the end
        columns = list(df_ac_modeling_data_buf.columns)
        columns.remove(self.dependent_var)
        columns.remove(self.ans_and_ex_clm)
        columns.append(self.dependent_var)
        columns.append(self.ans_and_ex_clm)
        df_ac_modeling_data_buf = df_ac_modeling_data_buf.reindex(
            columns=columns)
        df_ac_modeling_data_buf.to_csv(self.modeling_data_file_name)

        train_val_ds = self.get_tokens()
        train_ds, val_ds = train_val_ds.split(
            split_ratio=0.8, random_state=random.seed(random_seed))

        train_dl = torchtext.data.Iterator(train_ds, batch_size=self.batch_size,
                                           train=True)
        val_dl = torchtext.data.Iterator(val_ds, batch_size=self.batch_size,
                                         train=False, sort=False)
        self.dataloaders_dict = {"train": train_dl, "val": val_dl}

        batch = next(iter(val_dl))
        print(batch.Text)
        print(batch.Label)

        text_minibatch_1 = (batch.Text[0][1]).numpy()
        text = self.tokenizer_bert.convert_ids_to_tokens(text_minibatch_1)
        print(text)

        print('Building model...')
        net = BertForVDOK(self.net_bert, number_class)
        net.train()

        for name, param in net.named_parameters():
            param.requires_grad = False
        for name, param in net.bert.encoder.layer[-1].named_parameters():
            param.requires_grad = True
        for name, param in net.cls.named_parameters():
            param.requires_grad = True

        optimizer = optim.Adam(
            [{'params': net.bert.encoder.layer[-1].parameters(), 'lr': 5e-5},
             {'params': net.cls.parameters(), 'lr': 5e-5}],
            betas=(0.9, 0.999))
        self.criterion = nn.CrossEntropyLoss()

        self.net_trained = self.train_model(net, self.dataloaders_dict,
                                            self.criterion, optimizer,
                                            num_epochs=epochs)

        # Modified by [email protected] 11/22/2020
        # save_path = self.bert_dir + 'weights/bert_fine_tuning_VDOK_' + key_word + '.pth'
        # torch.save(self.net_trained.state_dict(), save_path)
        save_path = self.bert_dir + bert_pkl_name + key_word + '.pkl'
        with open(save_path, 'wb') as f:
            cloudpickle.dump(self.net_trained, f)

    def train_model(self, net, dataloaders_dict, criterion, optimizer, num_epochs):
        device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
        print('Using device: ', device)
        print('-----start-------')
        net.to(device)
        torch.backends.cudnn.benchmark = True
        batch_size = dataloaders_dict["train"].batch_size

        for epoch in range(num_epochs):
            for phase in ['train', 'val']:
                if phase == 'train':
                    net.train()
                else:
                    net.eval()

                epoch_loss = 0.0
                epoch_corrects = 0
                iteration = 1
                t_epoch_start = time.time()
                t_iter_start = time.time()

                for batch in (dataloaders_dict[phase]):
                    inputs = batch.Text[0].to(device)
                    labels = batch.Label.to(device)

                    optimizer.zero_grad()
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = net(inputs,
                                      token_type_ids=None,
                                      attention_mask=None,
                                      output_all_encoded_layers=False,
                                      attention_show_flg=False)
                        loss = criterion(outputs, labels)
                        _, preds = torch.max(outputs, 1)

                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                            if (iteration % 10 == 0):
                                t_iter_finish = time.time()
                                duration = t_iter_finish - t_iter_start
                                acc = (torch.sum(preds == labels.data)).double() / batch_size
                                print('Iteration {} || Loss: {:.4f} || 10iter: {:.4f} sec. || Accuracy: {}'
                                      .format(iteration, loss.item(), duration, acc))
                                t_iter_start = time.time()

                        iteration += 1
                        epoch_loss += loss.item() * batch_size
                        epoch_corrects += torch.sum(preds == labels.data)

                t_epoch_finish = time.time()
                epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
                epoch_acc = epoch_corrects.double() / len(
                    dataloaders_dict[phase].dataset)

                print('Epoch {}/{} | {:^5} | Loss: {:.4f} Acc: {:.4f}'.format(
                    epoch + 1, num_epochs, phase, epoch_loss, epoch_acc))
                t_epoch_start = time.time()

        return net

    def perform_prediction(self, df_ac_prediction_data, number_class):
        self.df_ac_predict_target = df_ac_prediction_data.loc[:, [self.dependent_var]]

        df_ac_prediction_data_buf = df_ac_prediction_data.copy()
        if self.ans_and_ex_clm not in df_ac_prediction_data_buf.columns:
            df_ac_prediction_data_buf[self.ans_and_ex_clm] = df_ac_prediction_data_buf[self.answer_ex_clm] \
                + ' ' + df_ac_prediction_data_buf[self.ans_clm]

        # to move LABEL and TXT columns to the end
        columns = list(df_ac_prediction_data_buf.columns)
        columns.remove(self.dependent_var)
        columns.remove(self.ans_and_ex_clm)
        columns.append(self.dependent_var)
        columns.append(self.ans_and_ex_clm)
        df_ac_prediction_data_buf = df_ac_prediction_data_buf.reindex(
            columns=columns)
        df_ac_prediction_data_buf.to_csv(self.modeling_data_file_name)

        test_ds = self.get_tokens()
        test_dl = torchtext.data.Iterator(test_ds, batch_size=self.batch_size,
                                          train=False, sort=False)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.net_trained.eval()
        self.net_trained.to(device)

        epoch_corrects = 0
        self.predict_res = []
        for batch in tqdm(test_dl):
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            inputs = batch.Text[0].to(device)
            labels = batch.Label.to(device)

            with torch.set_grad_enabled(False):
                outputs = self.net_trained(inputs,
                                           token_type_ids=None,
                                           attention_mask=None,
                                           output_all_encoded_layers=False,
                                           attention_show_flg=False)
                loss = self.criterion(outputs, labels)
                _, preds = torch.max(outputs, 1)
                epoch_corrects += torch.sum(preds == labels.data)
                self.predict_res += preds.tolist()

        epoch_acc = epoch_corrects.double() / len(test_dl.dataset)
        print('Test Data {} Accuracy: {:.4f}'.format(len(test_dl.dataset), epoch_acc))

        self.df_ac_classified = pd.DataFrame(
            np.array(self.predict_res, dtype=np.int64),
            df_ac_prediction_data.index, [r'Score_Class'])
        self.df_ac_classified[self.dependent_var] = self.df_ac_predict_target[
            self.dependent_var]

    def modeling_prediction_evaluation_all(self, key_word=r'', csv_dump=False,
                                           number_class=3, epochs=10,
                                           batch_size=32):
        self.df_ac_predict_target_all = pd.DataFrame()
        self.predict_res_all = np.array([], np.int64)
        self.df_ac_classified_all = pd.DataFrame()

        for x in range(len(self.random_order_set)):
            print(r'----------------')
            print(r'RANDOM SET: ', x)
            self.iloc_concat_for_cross_validation(x)
            self.perform_modeling(
                self.df_ac_modeling_values.iloc[self.concatenated_value_order, :],
                key_word, csv_dump, number_class, epochs)
            self.perform_prediction(
                self.df_ac_modeling_values.iloc[self.random_order_set[x], :],
                number_class)
            self.evaluate_prediction(key_word)

            if len(self.df_ac_predict_target_all) == 0:
                self.df_ac_predict_target_all = self.df_ac_predict_target.copy()
            else:
                self.df_ac_predict_target_all = self.df_ac_predict_target_all.append(
                    self.df_ac_predict_target)
            self.predict_res_all = np.append(self.predict_res_all, self.predict_res)

            if len(self.df_ac_classified_all) == 0:
                self.df_ac_classified_all = self.df_ac_classified.copy()
                self.df_indices_all = pd.DataFrame(self.se_indices)
            else:
                self.df_ac_classified_all = self.df_ac_classified_all.append(
                    self.df_ac_classified)
                self.df_indices_all = pd.concat(
                    [self.df_indices_all, self.se_indices], axis=1)

        self.df_indices_all = self.df_indices_all.T
        print(r'----------------')
        print(r'ALL DATA (Macro Average):')
        print(self.df_indices_all.describe())
        if csv_dump == True:
            self.df_indices_all.describe().to_csv(
                self.data_dir + r'Classified-Prediction-Indices-Macro-' + key_word + r'.csv',
                encoding='latin1')

        print(r'----------------')
        print(r'ALL DATA (Micro Average):')
        self.evaluate_prediction(
            key_word, csv_dump=True,
            df_ac_predict_target=self.df_ac_predict_target_all,
            predict_res=self.predict_res_all)

    # Modified by [email protected] on 11/22/2020
    def restore_model(self, key_word=r'',
                      tmp_csv_name='TORCH_RESPONSE_ANSWER_EX_FILE.CSV',
                      bert_pkl_name='weights/bert_fine_tuning_VDOK_',
                      batch_size=32):
        self.modeling_data_file_name = self.data_dir + tmp_csv_name
        self.batch_size = batch_size
        save_path = self.bert_dir + bert_pkl_name + key_word + '.pkl'
        with open(save_path, 'rb') as f:
            self.net_trained = cloudpickle.load(f)
        self.criterion = nn.CrossEntropyLoss()
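# A hedged end-to-end sketch (added for illustration; the CSV name, the dependent
# variable column 'Score_Level', and the key_word value are assumptions, not taken
# from the original code). It assumes a CSV indexed by 'Student_Question_Index' with
# the Definition-* columns referenced in load_data().
clf = tmv_torch_bert_classify(data_dir=r'./',
                              bert_dir=r'./pytorch_advanced/nlp_sentiment_bert/')
clf.load_data('responses.csv', dependent_var='Score_Level', langs=['EN'])
clf.perform_modeling(clf.df_ac_modeling_values, key_word='definition',
                     number_class=3, epochs=10, batch_size=32)

# Later, reload the fine-tuned model without re-training and score new data:
clf.restore_model(key_word='definition')
clf.perform_prediction(clf.df_ac_modeling_values, number_class=3)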
def tokenizer_with_preprocessing(text):
    tokenizer_bert = BertTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)
    text = preprocessing_text(text)
    ret = tokenizer_bert.tokenize(text)
    return ret
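# A hedged usage check (added; not in the original). It assumes VOCAB_FILE points to the
# Japanese WordPiece vocabulary and preprocessing_text() is defined as elsewhere in this
# project; the exact sub-word split depends on that vocabulary file.
tokens = tokenizer_with_preprocessing("売上高は123億円となりました。")
print(tokens)  # digits are collapsed to "0" by the preprocessing before WordPiece splitting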
    # Replace symbols other than commas and periods with spaces
    for p in string.punctuation:
        if (p == ".") or (p == ","):
            continue
        else:
            text = text.replace(p, " ")

    # Put spaces around periods and commas
    text = text.replace(".", " . ")
    text = text.replace(",", " , ")
    return text


# Prepare the tokenizer used for word segmentation
tokenizer_bert = BertTokenizer(
    vocab_file="./weights/bert-base-uncased-vocab.txt", do_lower_case=True)


# Combine preprocessing and word segmentation in one function.
# Note that tokenizer_bert.tokenize (the segmentation function) is passed,
# not tokenizer_bert itself.
def tokenizer_with_preprocessing(text, tokenizer=tokenizer_bert.tokenize):
    text = preprocessing_text(text)
    ret = tokenizer(text)  # tokenizer_bert
    return ret


def main():
    # define output dataframe
    sample = pd.read_csv("./data/sample_submission.csv")

    # Define how the loaded data is processed when it is read
    max_length = 256
def __init__(self, vocab_file, max_text_length, **kwargs):
    self.tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=True)
    self.text_field, self.label_field = self._prepare(max_text_length)
    self.vocab, self.ids_to_tokens = self._load_vocab(vocab_file)