def create_masks(self, inp, tar):
    """Build the attention masks for an (input, target) pair.

    Args:
        inp: Encoder input passed to ``create_padding_mask``.
        tar: Decoder target; its second dimension (``tf.shape(tar)[1]``)
            sizes the look-ahead mask.

    Returns:
        ``(enc_padding_mask, combined_mask, dec_padding_mask)``, or
        ``(None, None, None)`` when either argument is ``None``.
    """
    have_both = inp is not None and tar is not None
    if not have_both:
        return None, None, None

    # Encoder padding mask.
    encoder_mask = create_padding_mask(inp)

    # Used in the decoder's second attention block: this padding mask
    # hides the padded positions of the encoder output.
    decoder_mask = create_padding_mask(inp)

    # Used in the decoder's first attention block: pads the decoder input
    # and masks its future tokens.
    target_length = tf.shape(tar)[1]
    future_mask = create_look_ahead_mask(target_length)
    target_padding_mask = create_padding_mask(tar)

    return (
        encoder_mask,
        tf.maximum(target_padding_mask, future_mask),
        decoder_mask,
    )
def forward(self, enc_in, dec_in, padding_mask=None):
    """Embed both inputs, run the encoder and decoder, and project the result.

    Args:
        enc_in: Raw encoder input, embedded via ``self.embed_input``.
        dec_in: Raw decoder input, embedded via ``self.embed_input``.
        padding_mask: Optional mask forwarded to both encoder and decoder.
            It is ignored when ``self.encoder_reduce_dim`` is truthy: the
            mask is then rebuilt from the embedded encoder input.

    Returns:
        The projected decoder output, passed through ``self.softmax`` when
        ``self.apply_softmax`` is truthy.
    """
    embedded_src, embedded_tgt = self.embed_input(enc_in, dec_in)

    mask = padding_mask
    if self.encoder_reduce_dim:
        # The caller-supplied mask is replaced by one computed from the
        # embedded encoder input.
        mask = u.create_padding_mask(embedded_src, self.pad_idx)

    memory = self.encoder(embedded_src, padding_mask=mask)
    decoded = self.decoder(embedded_tgt, memory, padding_mask=mask)
    projected = self.output_projection(decoded)

    if self.apply_softmax:
        return self.softmax(projected)
    return projected
def custom_collator(self, batch):
    """Collate ``(encoder_input, decoder_input)`` pairs into padded batches.

    Encoder inputs are padded with ``pad_sequence`` (batch first); decoder
    inputs are padded by ``u.pad_documents`` and wrapped in a
    ``torch.LongTensor``. Both use the index of ``self.pad`` in
    ``self.vocab_2_idx`` as the padding value.

    Args:
        batch: Iterable of ``(encoder_input, decoder_input)`` pairs.

    Returns:
        ``(encoder_batch, decoder_batch)``, with the encoder padding mask
        appended as a third element when ``self.create_mask`` is truthy.
    """
    encoder_seqs, decoder_docs = zip(*batch)
    pad_index = self.vocab_2_idx[self.pad]

    padded_encoder = pad_sequence(
        encoder_seqs, batch_first=True, padding_value=pad_index)
    padded_decoder = torch.LongTensor(
        u.pad_documents(decoder_docs, pad_index))

    if not self.create_mask:
        return padded_encoder, padded_decoder

    encoder_mask = u.create_padding_mask(padded_encoder, pad_index)
    return padded_encoder, padded_decoder, encoder_mask
def __call__(self, batch):
    """Collate ``(encoder_input, decoder_input)`` pairs into padded batches.

    Both sides are padded with ``pad_sequence`` (batch first) using
    ``self.pad_idx``; the encoder batch is additionally cast to float.

    Args:
        batch: Iterable of ``(encoder_input, decoder_input)`` tensor pairs.

    Returns:
        ``(encoder_batch, decoder_batch)``, with the encoder padding mask
        appended as a third element when ``self.create_mask`` is truthy.
    """
    encoder_seqs, decoder_seqs = zip(*batch)

    padded_encoder = pad_sequence(
        encoder_seqs, batch_first=True, padding_value=self.pad_idx)
    padded_encoder = padded_encoder.float()
    padded_decoder = pad_sequence(
        decoder_seqs, batch_first=True, padding_value=self.pad_idx)

    if not self.create_mask:
        return padded_encoder, padded_decoder

    encoder_mask = u.create_padding_mask(padded_encoder, self.pad_idx)
    return padded_encoder, padded_decoder, encoder_mask
def evaluate(test_dataset):
    """Run the transformer over ``test_dataset`` and compute F1 metrics.

    Args:
        test_dataset: Iterable yielding ``(inp, tar)`` batches.

    Returns:
        ``(mi_f1, ma_f1, sample_f1, tars, predictions)``. ``mi_f1`` and
        ``ma_f1`` come from ``micro_f1`` / ``macro_f1`` on the raw
        concatenated outputs; ``sample_f1`` is
        ``f1_score(..., average='samples')`` on the outputs binarised at
        0.5. The returned ``tars`` and ``predictions`` are the binarised
        arrays.
    """
    predictions = []
    tars = []
    # Wrap the dataset itself rather than enumerate(): the batch index was
    # never used, and tqdm can only show a total when it sees an iterable
    # that reports a length.
    for inp, tar in tqdm(test_dataset):
        enc_padding_mask = create_padding_mask(inp)
        # Second positional argument is presumably the training flag --
        # TODO confirm against the model's call signature.
        predict = transformer(inp, False, enc_padding_mask=enc_padding_mask)
        predictions.append(predict)
        tars.append(tar)

    predictions = tf.concat(predictions, axis=0)
    tars = tf.concat(tars, axis=0)

    # These two metrics are computed before thresholding.
    mi_f1 = micro_f1(tars, predictions)
    ma_f1 = macro_f1(tars, predictions)

    # Binarise at 0.5 for the sample-averaged F1.
    predictions = np.where(predictions > 0.5, 1, 0)
    tars = np.where(tars > 0.5, 1, 0)
    sample_f1 = f1_score(tars, predictions, average='samples')

    return mi_f1, ma_f1, sample_f1, tars, predictions
def train_step(inp, tar):
    """Run one optimisation step on a batch and return its F1 scores.

    The forward pass and loss are recorded on a ``tf.GradientTape``; the
    resulting gradients are applied to the transformer's trainable
    variables, and the running ``train_loss`` / ``train_accuracy`` metrics
    are updated.

    Args:
        inp: Input batch; also used to build the encoder padding mask.
        tar: Target batch.

    Returns:
        ``(micro_f1, macro_f1)`` for this batch's predictions.
    """
    mask = create_padding_mask(inp)

    with tf.GradientTape() as tape:
        predictions = transformer(
            inp, training=True, enc_padding_mask=mask)
        loss = loss_function(tar, predictions)

    variables = transformer.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))

    # Update the running metrics.
    train_loss(loss)
    train_accuracy(tar, predictions)

    return micro_f1(tar, predictions), macro_f1(tar, predictions)
def dataset_generator(folder, batch_size=32, metadata=None):
    """Read feature files from ``folder`` and yield padded batches.

    Args:
        folder: Path whose second-to-last ``/``-separated component names
            the metadata file (``metadata_<component>.pk``).
        batch_size: Number of samples per batch; the final batch may be
            smaller. Defaults to 32.
        metadata: Optional dict providing ``'id_2_doc'`` and ``'PAD'``.
            When ``None`` it is loaded with ``load_metadata``.

    Yields:
        ``(encoder_input_batch, decoder_input_batch, padding_mask_batch)``.
    """
    filelist = get_list_files(folder)
    save_name = '{}_{}.pk'.format('metadata', folder.split('/')[-2])
    if metadata is None:
        metadata = load_metadata(save_name)

    feature_files = filelist['features']
    last_position = len(feature_files) - 1
    encoder_inputs, decoder_inputs = [], []

    for position, path in enumerate(feature_files):
        # The sample id is the file name without its '.features.npy' suffix.
        identity = path.split('/')[-1].replace('.features.npy', '')
        encoder_input = torch.tensor(np.load(path))
        decoder_input = metadata['id_2_doc'][identity]
        encoder_inputs.append(encoder_input)
        decoder_inputs.append(decoder_input)

        batch_is_full = len(encoder_inputs) == batch_size
        if not batch_is_full and position != last_position:
            continue

        pad_index = metadata['PAD'].index
        encoder_input_batch = pad_sequence(
            encoder_inputs, batch_first=True, padding_value=pad_index)
        # Assumes the decoder documents already share one length --
        # TODO confirm.
        decoder_input_batch = torch.LongTensor(decoder_inputs)
        padding_mask_batch = u.create_padding_mask(
            encoder_input_batch, pad_index)
        yield encoder_input_batch, decoder_input_batch, padding_mask_batch

        # Start a fresh batch.
        encoder_inputs, decoder_inputs = [], []
def preprocess(file, BATCH_SIZE, max_length, tokenizer):
    """Build batches of sentence pairs with next-sentence and masked labels.

    The text file is split into sentences on ``; : . ! ?``. Each sample
    pairs a sentence either with its successor (label 1) or with a randomly
    chosen sentence (label 0). About 15% of the non-special tokens are
    selected: of those, 80% are replaced by the mask id, 10% by a random
    id, and 10% are left unchanged.

    Args:
        file: Path of the text file to read.
        BATCH_SIZE: Maximum number of samples per batch.
        max_length: Sequence length handed to ``get_ids`` / ``get_segments``.
        tokenizer: Object exposing ``vocab`` and ``tokenize``.

    Returns:
        A list of batches, each ``[input_ids, segments, masks, is_next,
        is_masked]``. ``is_next`` has shape ``(n, 1)``; ``is_masked`` holds
        the original id at every selected position and 0 elsewhere.
    """
    # Hard-coded ids: padding, [CLS], [SEP] and the mask token. They
    # presumably match the tokenizer's vocabulary -- TODO confirm.
    pad_id, cls_id, sep_id, mask_id = 0, 101, 102, 103

    train_dataset = []
    input_vocab_size = len(tokenizer.vocab)

    # The context manager closes the file; the original left it open.
    with open(file, 'r') as f:
        words = f.read()
    words = words.replace('\n\n', '.')
    words = words.replace('\n', ' ')
    words = re.split('[;:.!?]', words)

    i = 0
    for _ in range(len(words) // BATCH_SIZE + 1):
        if i + 1 >= len(words):
            break
        input_ids_list = []
        segment_list = []
        is_masked_list = []
        is_next_list = []

        for _ in range(BATCH_SIZE):
            if i + 1 >= len(words):
                break

            # Decide whether the second sentence is the real successor.
            now = int(random.random() > 0.5)
            first_tokens = tokenizer.tokenize(words[i])
            if now == 1:
                second_sentence = words[i + 1]
            else:
                second_sentence = words[random.randint(0, len(words) - 1)]
            res = (["[CLS]"] + first_tokens + ["[SEP]"]
                   + tokenizer.tokenize(second_sentence) + ["[SEP]"])

            input_ids = get_ids(res, tokenizer, max_length)
            segment_list.append(get_segments(res, max_length))
            is_next_list.append(now)

            is_masked = [0] * max_length
            for ind in range(max_length):
                token_id = input_ids[ind]
                if token_id == pad_id:
                    # Stop at the first padding token.
                    break
                if token_id in (cls_id, sep_id):
                    # Never mask [CLS] or [SEP].
                    continue
                if random.random() < 0.15:  # select 15% of tokens
                    is_masked[ind] = token_id
                    if random.random() < 0.8:
                        # 80% of the selected tokens become the mask token.
                        input_ids[ind] = mask_id
                    elif random.random() < 0.5:
                        # 10% become a random token. randint includes its
                        # upper bound, so stop at the last valid id.
                        input_ids[ind] = random.randint(
                            1000, input_vocab_size - 1)
                    # The remaining 10% keep their original token.

            input_ids_list.append(input_ids)
            is_masked_list.append(is_masked)

            # A true pair consumes both sentences; otherwise only one.
            i += 2 if now == 1 else 1

        input_ids_list = np.array(input_ids_list)
        is_masked_list = np.array(is_masked_list)
        masks = create_padding_mask(input_ids_list)
        segment_list = np.array(segment_list)
        is_next_list = np.array(is_next_list)
        is_next_list = np.reshape(is_next_list, (len(is_next_list), 1))
        train_dataset.append([
            input_ids_list, segment_list, masks, is_next_list, is_masked_list
        ])

    return train_dataset
epsilon=1e-9) for epoch in range(EPOCHS): start = time.time() train_loss.reset_states() train_accuracy.reset_states() print('Start Train......') for (batch, (inp, tar)) in enumerate(train_dataset): time1 = time.time() mic_f1, mac_f1 = train_step(inp, tar) if batch % 50 == 0: test_input, test_target = next(iter(valid_dataset)) enc_padding_mask = create_padding_mask(test_input) val_mic_f1, val_mac_f1 = predict(test_input, test_target, enc_padding_mask) print( 'Epoch {} Batch {} Loss {:.4f} micro_f1 {:.4f} macro_f1 {:.4f} val_micro_f1 {:.4f} val_macro_f1 {:.4f}' .format(epoch + 1, batch, train_loss.result(), mic_f1, mac_f1, val_mic_f1, val_mac_f1)) print('Cost time:{}'.format(time.time() - time1)) if (epoch + 1) % 5 == 0: ckpt_save_path = ckpt_manager.save() print('Saving checkpoint for epoch {} at {}'.format( epoch + 1, ckpt_save_path)) print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(