def embed_target(self, target_ids, seq_lens, mask_lm=True):
    target_ids = get_tensor_from_array(target_ids).long()
    embedding_output = self.pretrained_word_embeddings(target_ids)
    predict_mask = None
    recon_target = None
    if mask_lm:
        # BERT-style masked LM: select positions and swap in the mask embedding
        input_mask, predict_mask = get_mlm_masks(
            target_ids, self.mask_prob, self.mask_but_no_prob)
        input_mask = input_mask.unsqueeze(2)
        embedding_output = self.use_pretrained_mask_embedding(
            embedding_output, input_mask)
        recon_target = embedding_output
        recon_target = self.pad_front(recon_target, 0)
    target_ids = self.pad_front(target_ids, 0)
    embedding_output, seq_lens = self.wrap_with_embeddings(
        embedding_output, seq_lens)
    embedding_output = self.add_beside_word_embeddings(embedding_output)
    embedding_output = self.pretrained_embedding_layer_norm(
        embedding_output)
    attention_mask = get_attention_mask(seq_lens, target_ids.shape[1])
    if predict_mask is not None:
        predict_mask = self.pad_front(predict_mask, 1)
    return recon_target, embedding_output, attention_mask, predict_mask
def predict(self, frame_feat, lens):
    frame_feat = get_tensor_from_array(frame_feat)
    mask = get_attention_mask(lens, frame_feat.shape[1])
    x = self.feat_embeddings(frame_feat)
    x = self.positional_encoding(x)
    outputs = self.forward(x, mask)
    outputs = self.target_out_layer(outputs)  # project to per-frame target logits
    return outputs
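# Usage sketch for predict (shapes are assumptions inferred from the code, not
# confirmed by the snippet): frame_feat is an (N, T, D) batch of acoustic frames
# and lens holds the true sequence lengths.
#
#   logits = model.predict(frame_feat, lens)   # (N, T, num_targets)
#   frame_pred = logits.argmax(dim=-1)         # per-frame predictions, (N, T)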
def finetune_loss(self, frame_feat, frame_label, lens):
    outputs = self.predict(frame_feat, lens)
    outputs = outputs.transpose(1, 2)  # (N, T, C) -> (N, C, T) as CrossEntropyLoss expects
    frame_label = get_tensor_from_array(frame_label).long()
    loss = nn.CrossEntropyLoss(reduction='none')(outputs, frame_label)  # per-frame loss, (N, T)
    mask = get_attention_mask(lens, frame_feat.shape[1])
    loss = masked_reduce_mean(loss, mask)  # average over valid frames of each sequence
    loss = loss.mean()  # average over the batch
    return loss
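# finetune_loss depends on masked_reduce_mean, which is not shown in this snippet.
# A minimal sketch consistent with its usage above (an assumption; the repo's
# version may differ): average the per-frame loss over the valid, unpadded
# positions of each sequence.
def masked_reduce_mean_sketch(x, mask, eps=1e-8):
    # x, mask: (N, T); padded positions have mask == 0
    return (x * mask).sum(dim=1) / (mask.sum(dim=1) + eps)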
def test_create_attention_mask(self):
    seq_lens = np.array([3, 5, 1, 0, 9, 100])
    max_len = 9
    mask = get_attention_mask(seq_lens, max_len)
    mask = mask.data.numpy()
    ans = np.array([
        [1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1],
    ])
    self.assertTrue(np.all(mask == ans))
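# A minimal sketch of get_attention_mask that satisfies the test above (an
# assumption, not necessarily the repo's implementation): position t of sequence
# i is valid iff t < seq_lens[i], so lengths beyond max_len yield all-ones rows.
import numpy as np
import torch

def get_attention_mask_sketch(seq_lens, max_len):
    positions = np.arange(max_len)                             # (max_len,)
    mask = positions[None, :] < np.asarray(seq_lens)[:, None]  # (N, max_len) boolean
    return torch.from_numpy(mask.astype(np.float32))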
def embed_feats(self, feats, seq_lens, mask_lm=True):
    feats = get_tensor_from_array(feats)
    embedding_output = self.feat_embeddings(feats)
    predict_mask = None
    if mask_lm:
        # masked LM pretraining: select frames and swap in the mask embedding
        input_mask, predict_mask = get_mlm_masks(
            feats, self.mask_prob, self.mask_but_no_prob)
        embedding_output = self.use_pretrained_mask_embedding(
            embedding_output, input_mask)
    feats = self.pad_front(feats, 0)
    embedding_output, seq_lens = self.wrap_with_embeddings(
        embedding_output, seq_lens)
    embedding_output = self.add_beside_word_embeddings(embedding_output)
    attention_mask = get_attention_mask(seq_lens, feats.shape[1])
    if predict_mask is not None:
        predict_mask = self.pad_front(predict_mask, 1)
    return feats, embedding_output, attention_mask, predict_mask
def pretrain_loss(self, input_feats, seq_lens):
    input_feats = get_tensor_from_array(input_feats)
    attention_mask = get_attention_mask(seq_lens, input_feats.shape[1])
    input_mask, predict_mask = get_mlm_masks(
        input_feats, self.mask_prob, self.mask_but_no_prob)
    # replace the selected frames with the learned mask vector
    masked_input_feats = input_mask * input_feats + (1 - input_mask) * self.feat_mask_vec
    masked_input_feats *= attention_mask.unsqueeze(2)  # zero out the paddings
    x = self.feat_embeddings(masked_input_feats)
    x = self.positional_encoding(x)
    output = self.forward(x, attention_mask)
    output = self.feat_out_layer(output)
    to_predict = (1 - predict_mask.squeeze()) * attention_mask  # shape: (N, T)
    loss = cpc_loss(output, input_feats, to_predict, attention_mask)
    return loss
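# pretrain_loss and the embedding helpers above depend on get_mlm_masks, which is
# not shown. A hedged sketch of what it plausibly returns, inferred from usage only:
#   input_mask   -- 1 where the original frame is kept, 0 where it is replaced by the mask vector
#   predict_mask -- 0 at positions the model must reconstruct, 1 elsewhere
# Shapes and the BERT-style role of mask_but_no_prob (selected for prediction but left
# unmasked) are assumptions; embed_target above unsqueezes a 2-D mask, so the real
# helper likely adapts to the rank of its input.
import torch

def get_mlm_masks_sketch(feats, mask_prob, mask_but_no_prob):
    n, t = feats.shape[0], feats.shape[1]
    selected = (torch.rand(n, t, 1) < mask_prob).float()            # positions to predict
    keep_anyway = (torch.rand(n, t, 1) < mask_but_no_prob).float()  # predicted but left unmasked
    input_mask = 1 - selected * (1 - keep_anyway)                   # 0 only where input is actually masked
    predict_mask = 1 - selected                                     # 0 where the loss is computed
    return input_mask, predict_mask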