def convert_data_to_feature():
    # Load the question dataset
    q = open('Dataset/Query_Train/Final_question.txt', "r", encoding="utf-8")
    questions = q.readlines()
    q.close()
    # Load the answer (label) dataset
    a = open('Dataset/Train_Label/FinalDomainLabel.txt', "r", encoding="utf-8")
    answers = a.readlines()
    a.close()
    assert len(answers) == len(questions)

    # ans_dic maps each answer to its class
    ans_dic = make_ans_dic(answers)
    # question_dic holds the questions
    question_dic = make_question_dic(questions)
    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')

    q_tokens = []
    max_seq_len = 0
    for q in question_dic.data:
        # tokenize, map to vocabulary ids, then add [CLS] and [SEP]
        bert_ids = tokenizer.build_inputs_with_special_tokens(
            tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q)))
        if len(bert_ids) > max_seq_len:
            max_seq_len = len(bert_ids)
        q_tokens.append(bert_ids)
    print("Longest question length:", max_seq_len)
    assert max_seq_len <= 512  # within the BERT-base sequence length limit

    # Zero-pad every question up to max_seq_len
    for q in q_tokens:
        while len(q) < max_seq_len:
            q.append(0)

    a_labels = []
    for a in ans_dic.data:
        a_labels.append(ans_dic.to_id(a))

    # BERT input embeddings
    answer_lables = a_labels
    input_ids = q_tokens
    # Note: the masks are all 1s, so padded positions are not masked out here
    input_masks = [[1] * max_seq_len for i in range(len(question_dic))]
    input_segment_ids = [[0] * max_seq_len for i in range(len(question_dic))]
    assert len(input_ids) == len(question_dic) and len(input_ids) == len(
        input_masks) and len(input_ids) == len(input_segment_ids)

    data_features = {'input_ids': input_ids,
                     'input_masks': input_masks,
                     'input_segment_ids': input_segment_ids,
                     'answer_lables': answer_lables,
                     'question_dic': question_dic,
                     'answer_dic': ans_dic}

    with open('Dataset/data_features_domain.pkl', 'wb') as output:
        pickle.dump(data_features, output)
    return data_features
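# make_ans_dic / make_question_dic are not defined in this snippet. The class
# below is a minimal, hypothetical reconstruction of the interface the function
# above relies on (.data, .to_id(), len()); the real project classes may differ.
class SimpleDic:
    def __init__(self, items):
        self.data = [x.strip() for x in items]           # keep original order
        vocab = sorted(set(self.data))                    # stable label set
        self._ids = {label: i for i, label in enumerate(vocab)}

    def to_id(self, item):
        return self._ids[item.strip()]

    def __len__(self):
        return len(self.data)

def make_ans_dic(answers):
    return SimpleDic(answers)

def make_question_dic(questions):
    return SimpleDic(questions)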
def __init__(self, bert_tokenizer: BertTokenizer, jp_tokenizer: JumanTokenizer,
             args, file_path='train', block_size=512):
    assert os.path.isfile(file_path)
    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        directory, 'cached_lm_' + str(block_size) + '_' + filename)

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, 'rb') as handle:
            self.examples = pickle.load(handle)
    else:
        logger.info("Creating features from dataset file at %s", directory)
        self.examples = []
        with open(file_path, encoding="utf-8") as f:
            docs = f.readlines()

        for line in docs:
            text = line.rstrip(os.linesep)
            # split the text into tokens: Juman segmentation, then WordPiece, then ids
            tokenized_text = bert_tokenizer.convert_tokens_to_ids(
                bert_tokenizer.tokenize(" ".join(
                    jp_tokenizer.tokenize(text))))
            # add special tokens: [CLS] and [SEP]
            added_special = bert_tokenizer.build_inputs_with_special_tokens(
                tokenized_text)
            # Pad (or truncate) up to the sequence length
            diff = block_size - len(added_special)
            if diff < 0:
                added_special = added_special[:diff]
            else:
                # padding value changed from 0 to -1
                padding = [-1] * (block_size - len(added_special))
                added_special += padding
            assert len(added_special) == block_size
            self.examples.append(added_special)

        logger.info("Saving features into cached file %s", cached_features_file)
        with open(cached_features_file, 'wb') as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
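# Hedged sketch (not the project's actual code) of how such precomputed id
# sequences are usually wrapped and batched; the class name below is an
# assumption, but it follows the standard torch.utils.data.Dataset protocol.
import torch
from torch.utils.data import Dataset, DataLoader

class LineByLineIdsDataset(Dataset):
    """Minimal stand-in wrapping fixed-length token-id lists (block_size each)."""
    def __init__(self, examples):
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

# usage: each batch has shape (batch_size, block_size)
# loader = DataLoader(LineByLineIdsDataset(examples), batch_size=8, shuffle=True)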
def __init__(self, bert_tokenizer: BertTokenizer, jp_tokenizer: JumanTokenizer,
             args, file_path='train', block_size=512):
    assert os.path.isfile(file_path)
    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        directory,
        'dialogue_for_nsp' + '_cached_lm_' + str(block_size) + '_' + filename)

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, 'rb') as handle:
            self.examples, \
            self.token_type_ids, \
            self.attention_mask, \
            self.next_sentence_label = pickle.load(handle)
    else:
        # No cached data file, so build the dataset from the text file
        logger.info("Creating features from dataset file at %s", directory)
        self.examples = []             # [CLS] A A A [SEP] B B B [SEP]
        self.token_type_ids = []       # 0 0 0 0 0 1 1 1 1
        self.attention_mask = []       # 1 1 1 1 1 1 1 1 1 0 0 0 0 ...
        self.next_sentence_label = []  # [0, 1] 0: isNext, 1: notNext

        with open(file_path, encoding="utf-8") as f:
            docs = f.readlines()

        # Translation table: half-width ASCII -> full-width characters
        ZEN = "".join(chr(0xff01 + i) for i in range(94))
        HAN = "".join(chr(0x21 + i) for i in range(94))
        HAN2ZEN = str.maketrans(HAN, ZEN)

        num_doc = len(docs)
        for idx, line in enumerate(docs):
            text = line.rstrip(os.linesep)
            if text == "":
                continue
            try:
                next_text = docs[idx + 1].rstrip(os.linesep)
            except IndexError:
                continue
            if next_text == "":
                continue

            # When random.random() > nsp_swap_ratio, replace the true next
            # sentence with a random one for the NSP objective
            if random.random() > args.nsp_swap_ratio:
                while True:
                    rand_idx = random.randrange(0, num_doc)
                    next_text = docs[rand_idx].rstrip(os.linesep)
                    if (not next_text == "") and (rand_idx != idx + 1):
                        break
                nsp_label = 1  # random sequence
            else:
                nsp_label = 0  # continuation sequence

            # Workaround for Juman errors: replace half-width spaces with
            # full-width spaces and convert half-width characters to full-width
            text = text.replace(' ', '\u3000')
            next_text = next_text.replace(' ', '\u3000')
            text = mojimoji.han_to_zen(text, kana=False, digit=True, ascii=True)
            next_text = mojimoji.han_to_zen(next_text, kana=False, digit=True, ascii=True)
            text = text.translate(HAN2ZEN)
            next_text = next_text.translate(HAN2ZEN)

            # Skip lines that are too long (byte-length check)
            if len(text.encode('utf-8')) > 4096 or len(
                    next_text.encode('utf-8')) > 4096:
                continue

            first_tokenized_text = bert_tokenizer.convert_tokens_to_ids(
                bert_tokenizer.tokenize(" ".join(
                    jp_tokenizer.tokenize(text))))
            second_tokenized_text = bert_tokenizer.convert_tokens_to_ids(
                bert_tokenizer.tokenize(" ".join(
                    jp_tokenizer.tokenize(next_text))))
            fst_len = len(first_tokenized_text)
            scd_len = len(second_tokenized_text)

            # for i in range(0, len(tokenized_text)-block_size+1, block_size):  # Truncate in block of block_size
            #     self.examples.append(bert_tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
            # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.

            # Add special tokens:
            # A A A (B B B) -> [CLS] A A A [SEP] (B B B [SEP])
            added_special = bert_tokenizer.build_inputs_with_special_tokens(
                token_ids_0=first_tokenized_text,
                token_ids_1=second_tokenized_text)

            # Token type ids: 0 for [CLS] + sentence A + [SEP], 1 for sentence B + [SEP]
            type_ids = [0] * (2 + fst_len)
            scd_type = [1] * (1 + scd_len)
            type_ids += scd_type
            attention_mask = [1] * len(added_special)

            # Zero-pad (or truncate) up to the sequence length
            diff = block_size - len(added_special)
            if diff < 0:
                added_special = added_special[:diff]
                type_ids = type_ids[:diff]
                attention_mask = attention_mask[:diff]
            else:
                padding = [0] * diff
                added_special += padding
                type_ids += padding
                attention_mask += padding
            assert len(added_special) == block_size
            assert len(type_ids) == block_size
            assert len(attention_mask) == block_size

            self.examples.append(added_special)
            self.token_type_ids.append(type_ids)
            self.attention_mask.append(attention_mask)
            self.next_sentence_label.append(nsp_label)

        logger.info("Saving features into cached file %s", cached_features_file)
        with open(cached_features_file, 'wb') as handle:
            pickle.dump([self.examples, self.token_type_ids,
                         self.attention_mask, self.next_sentence_label],
                        handle, protocol=pickle.HIGHEST_PROTOCOL)
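# Hedged sketch (an assumption, not the project's code): the four parallel
# lists built above are usually exposed per index, e.g. from a __getitem__,
# so that they can be fed to a BertForPreTraining-style model.
import torch

def get_nsp_features(ds, i):
    """Return one example from a dataset built by the __init__ above."""
    return (torch.tensor(ds.examples[i], dtype=torch.long),             # input_ids
            torch.tensor(ds.token_type_ids[i], dtype=torch.long),       # segment A=0 / B=1
            torch.tensor(ds.attention_mask[i], dtype=torch.long),       # 1 = real token, 0 = padding
            torch.tensor(ds.next_sentence_label[i], dtype=torch.long))  # 0: isNext, 1: notNext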
def convert_data_to_feature():
    with open('Taipei_QA_new.txt', 'r', encoding='utf-8') as f:
        data = f.read()
    qa_pairs = data.split("\n")
    questions = []
    answers = []
    for qa_pair in qa_pairs:
        # each line is "answer question"; skip malformed lines
        qa_pair = qa_pair.split()
        try:
            a, q = qa_pair
            questions.append(q)
            answers.append(a)
        except ValueError:
            continue
    assert len(answers) == len(questions)

    ans_dic = make_ans_dic(answers)
    question_dic = make_question_dic(questions)
    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')

    q_tokens = []
    max_seq_len = 0
    for q in question_dic.data:
        # tokenize, map to vocabulary ids, then add [CLS] and [SEP]
        bert_ids = tokenizer.build_inputs_with_special_tokens(
            tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q)))
        if len(bert_ids) > max_seq_len:
            max_seq_len = len(bert_ids)
        q_tokens.append(bert_ids)
    print("Longest question length:", max_seq_len)
    assert max_seq_len <= 512  # within the BERT-base sequence length limit

    # Zero-pad every question up to max_seq_len
    for q in q_tokens:
        while len(q) < max_seq_len:
            q.append(0)

    a_labels = []
    for a in ans_dic.data:
        a_labels.append(ans_dic.to_id(a))

    # BERT input embeddings
    answer_lables = a_labels
    input_ids = q_tokens
    # Note: the masks are all 1s, so padded positions are not masked out here
    input_masks = [[1] * max_seq_len for i in range(len(question_dic))]
    input_segment_ids = [[0] * max_seq_len for i in range(len(question_dic))]
    assert len(input_ids) == len(question_dic) and len(input_ids) == len(
        input_masks) and len(input_ids) == len(input_segment_ids)

    data_features = {'input_ids': input_ids,
                     'input_masks': input_masks,
                     'input_segment_ids': input_segment_ids,
                     'answer_lables': answer_lables,
                     'question_dic': question_dic,
                     'answer_dic': ans_dic}

    with open('trained_model/data_features.pkl', 'wb') as output:
        pickle.dump(data_features, output)
    return data_features
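# A minimal usage sketch (assumptions: torch is available and the feature lists
# produced above are rectangular); it shows how the returned data_features are
# typically turned into training batches.
import torch
from torch.utils.data import TensorDataset, DataLoader

features = convert_data_to_feature()
dataset = TensorDataset(
    torch.tensor(features['input_ids'], dtype=torch.long),
    torch.tensor(features['input_segment_ids'], dtype=torch.long),
    torch.tensor(features['input_masks'], dtype=torch.long),
    torch.tensor(features['answer_lables'], dtype=torch.long))
loader = DataLoader(dataset, batch_size=16, shuffle=True)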
def toBertIds(q_input):
    # Tokenize a single question and add [CLS]/[SEP], returning vocabulary ids
    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')
    return tokenizer.build_inputs_with_special_tokens(
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q_input)))
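# Hedged usage sketch: feed a single question through toBertIds and pad it to
# the training sequence length before running a classifier. The query text is
# hypothetical and max_seq_len is assumed from the surrounding training code.
import torch

ids = toBertIds("台北市的垃圾要怎麼分類")            # hypothetical query
ids = ids + [0] * (max_seq_len - len(ids))          # pad with 0s as in training
input_ids = torch.tensor([ids], dtype=torch.long)   # shape (1, max_seq_len)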
def convert_data_to_feature(FileName):
    # Load the vocabulary
    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')

    # Load the data: each non-empty line is "<label> <sentence>"
    Labels = []
    Sentences = []
    with open(FileName, 'r', encoding='utf-8') as f:
        data = f.read()
    LS_pairs = data.split("\n")
    for LS_pair in LS_pairs:
        if LS_pair != "":
            try:
                L = LS_pair[:1]   # single-character label
                S = LS_pair[2:]   # sentence after the separator
                Labels.append(int(L))
                Sentences.append(S)
            except ValueError:
                continue
    assert len(Labels) == len(Sentences)

    # BERT input embeddings
    max_seq_len = 0         # track the longest sequence
    input_ids = []
    original_length = []    # track the unpadded lengths
    for S in Sentences:
        # split the sentence into word pieces
        word_piece_list = tokenizer.tokenize(S)
        # map the tokens to vocabulary ids
        input_id = tokenizer.convert_tokens_to_ids(word_piece_list)
        # add [CLS] and [SEP]
        input_id = tokenizer.build_inputs_with_special_tokens(input_id)
        if len(input_id) > max_seq_len:
            max_seq_len = len(input_id)
        input_ids.append(input_id)
    print("Longest sentence length:", max_seq_len)
    assert max_seq_len <= 512  # within the BERT-base sequence length limit

    # Zero-pad every sentence up to max_seq_len, remembering the unpadded lengths
    for c in input_ids:
        original_length.append(len(c))
        while len(c) < max_seq_len:
            c.append(0)

    # token_type_ids: sentence id, 0 for the first sentence, 1 for the second
    segment_ids = [[0] * max_seq_len for i in range(len(Sentences))]

    # attention mask: 1 marks a real token, 0 marks a padding position
    position_ids = []
    for i in range(len(Sentences)):
        position_id = []
        for j in range(original_length[i]):
            position_id.append(1)
        while len(position_id) < max_seq_len:
            position_id.append(0)
        position_ids.append(position_id)

    assert len(input_ids) == len(segment_ids) and len(input_ids) == len(position_ids) and len(input_ids) == len(Labels)
    data_features = {'input_ids': input_ids,
                     'segment_ids': segment_ids,
                     'position_ids': position_ids,
                     'labels': Labels}
    return data_features
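# Hedged usage sketch: the expected input is one "<label><space><sentence>" pair
# per line with a single-character numeric label. The file name and sample
# sentences below are hypothetical.
sample = "1 這家餐廳的服務態度很好\n0 等了一個小時餐點還沒來\n"
with open('sample_sentences.txt', 'w', encoding='utf-8') as f:
    f.write(sample)
features = convert_data_to_feature('sample_sentences.txt')
print(len(features['input_ids']), len(features['labels']))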
# Exploratory / scratch code: inspect the dataset and tokenizer objects
print_tokenizer_special(tokenizer)
x = IncrementalDataset(idr.dataset,
                       transform=transforms.Compose([
                           Padding(max_length=idr.max_length),
                           ToTensor(tokenizer)
                       ]))
a = DataLoader(x, batch_size=10, sampler=SubsetRandomSampler(x.sampler))
for i, s in enumerate(a):
    print(i)
    print(s)

# REPL-style inspection of the tokenizer (expressions evaluated for their values only)
tokenizer.ids_to_tokens[0]
tokenizer.convert_ids_to_tokens
tokenizer.vocab.keys()
tokenizer.build_inputs_with_special_tokens([95, 209], [95, 209])
tokenizer.pretrained_vocab_files_map
tokenizer.encode("<BOS> I like tea")

s = idr.dataset.tokens[0]
tokenizer.decode(tokenizer.encode(" ".join(s)))

# Note: building one Tensor from these lists only works if all encoded
# sequences have the same length; the value is overwritten on the next line
encoded_tensor = torch.Tensor([
    tokenizer.encode(idr.dataset.partial[j])
    for j in range(len(idr.dataset.partial))
])
encoded_tensor = torch.Tensor(tokenizer.encode(idr.dataset.partial[0]))

model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
model.eval()
with torch.no_grad():
    pass  # forward pass not included in this snippet
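# Hedged sketch of what the no_grad block above typically leads to: running
# encoded ids through BertModel and reading the hidden states enabled by
# output_hidden_states=True. The sentence text and variable names are assumptions.
import torch
from transformers import BertModel, BertTokenizer

tok = BertTokenizer.from_pretrained("bert-base-uncased")
mdl = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
mdl.eval()

ids = torch.tensor([tok.encode("I like tea")])  # shape (1, seq_len), with [CLS]/[SEP]
with torch.no_grad():
    outputs = mdl(ids)
# outputs[2] holds the hidden states of the embedding layer plus all 12 encoder
# layers, each of shape (1, seq_len, 768)
hidden_states = outputs[2]
print(len(hidden_states), hidden_states[-1].shape)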
def convert_data_to_feature(FileName):
    with open(FileName, 'r', encoding='utf-8') as f:
        data = f.read()
    qa_pairs = data.split("\n")
    questions = []
    answers = []
    for qa_pair in qa_pairs:
        # each line is "answer question"; skip malformed lines
        qa_pair = qa_pair.split()
        try:
            a, q = qa_pair
            questions.append(q)
            answers.append(a)
        except ValueError:
            continue
    assert len(answers) == len(questions)

    ans_dic = make_ans_dic(answers)
    question_dic = make_question_dic(questions)
    tokenizer = BertTokenizer(vocab_file='bert-base-chinese-vocab.txt')

    q_tokens = []
    max_seq_len = 0
    for q in question_dic.data:
        # tokenize, map to vocabulary ids, then add [CLS] and [SEP]
        bert_ids = tokenizer.build_inputs_with_special_tokens(
            tokenizer.convert_tokens_to_ids(tokenizer.tokenize(q)))
        if len(bert_ids) > max_seq_len:
            max_seq_len = len(bert_ids)
        q_tokens.append(bert_ids)
    print("Longest question length:", max_seq_len)
    assert max_seq_len <= 512  # within the BERT-base sequence length limit

    # Zero-pad every question up to max_seq_len, remembering the unpadded lengths
    original_length = []
    for q in q_tokens:
        original_length.append(len(q))
        while len(q) < max_seq_len:
            q.append(0)

    a_labels = []
    for a in ans_dic.data:
        a_labels.append(ans_dic.to_id(a))

    # BERT input embeddings
    input_ids = q_tokens
    # token_type_ids: sentence id, 0 for the first sentence, 1 for the second
    input_segment_ids = [[0] * max_seq_len for i in range(len(question_dic))]
    # attention mask: 1 marks a real token, 0 marks a padding position
    input_masks = []
    for i in range(len(question_dic)):
        position_ids = []
        for j in range(original_length[i]):
            position_ids.append(1)
        while len(position_ids) < max_seq_len:
            position_ids.append(0)
        input_masks.append(position_ids)

    answer_lables = a_labels
    assert len(input_ids) == len(question_dic) and len(input_ids) == len(
        input_masks) and len(input_ids) == len(input_segment_ids)

    data_features = {'input_ids': input_ids,
                     'input_segment_ids': input_segment_ids,
                     'input_masks': input_masks,
                     'answer_lables': answer_lables,
                     'question_dic': question_dic,
                     'answer_dic': ans_dic}

    # Only the training data is guaranteed to contain every class, and the saved
    # dictionaries are needed later to map predictions back to the original labels
    if FileName == 'train_data.txt':
        with open('trained_model/data_features.pkl', 'wb') as fp:
            pickle.dump(data_features, fp)
    return data_features
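# Hedged sketch of how a batch of these features is typically consumed during
# fine-tuning. It assumes the transformers BertForSequenceClassification class
# and the 'bert-base-chinese' checkpoint; the batch slicing below is illustrative.
import torch
from transformers import BertForSequenceClassification

features = convert_data_to_feature('train_data.txt')
num_labels = len(set(features['answer_lables']))
model = BertForSequenceClassification.from_pretrained('bert-base-chinese',
                                                      num_labels=num_labels)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

input_ids = torch.tensor(features['input_ids'][:8], dtype=torch.long)
token_type_ids = torch.tensor(features['input_segment_ids'][:8], dtype=torch.long)
attention_mask = torch.tensor(features['input_masks'][:8], dtype=torch.long)
labels = torch.tensor(features['answer_lables'][:8], dtype=torch.long)

model.train()
outputs = model(input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels)
loss = outputs[0]   # classification loss for this mini-batch
loss.backward()
optimizer.step()
optimizer.zero_grad()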