def prediction_from_trained_model(self, ys, encoder_outputs, scores_list):
    """Score the next output label given a partial hypothesis.

    Called from the decoder during search (and usable for model
    ensembling): given the predicted label sequence so far, it returns
    the probability distribution over the next possible labels,
    i.e. roughly P(y_i | y_{<i}).
    """
    # Computed for reference but not passed to the layers below,
    # which receive non_pad_mask=None.
    non_pad_mask = torch.ones_like(ys).float().unsqueeze(-1)  # 1 x i x 1
    slf_attn_mask = get_subsequent_mask(ys)

    # -- Forward
    dec_output = self.positional_encoding(self.tgt_word_emb(ys))

    for dec_layer in self.layer_stack:
        dec_output, _, _ = dec_layer(dec_output,
                                     encoder_outputs,
                                     non_pad_mask=None,
                                     slf_attn_mask=slf_attn_mask,
                                     dec_enc_attn_mask=None)

    # Normalized hidden states for all positions (bottleneck features),
    # then the last position alone is projected to vocabulary logits.
    dec_output_Bneck = self.output_norm(dec_output)
    dec_output = self.output_norm(dec_output[:, -1])
    seq_logit = self.tgt_word_prj(dec_output)
    scores_list.append(seq_logit.unsqueeze(1))

    local_scores = F.log_softmax(seq_logit, dim=1)
    scores = F.softmax(seq_logit, dim=1)
    present_label = torch.argmax(scores, dim=1)
    return local_scores, scores_list, present_label, dec_output_Bneck
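# --- Illustrative usage (a minimal sketch, not part of the original code) ---
# Hedged example of driving greedy decoding with prediction_from_trained_model.
# `decoder`, `encoder_outputs`, `sos_id`, `eos_id`, and `max_len` are assumed
# names for this illustration; torch is already imported at module level.
def greedy_decode_sketch(decoder, encoder_outputs, sos_id, eos_id, max_len=100):
    """Greedy search: repeatedly score the next label with
    prediction_from_trained_model and append the argmax until <eos>."""
    ys = torch.full((1, 1), sos_id, dtype=torch.long,
                    device=encoder_outputs.device)  # start from <sos>
    scores_list = []
    for _ in range(max_len):
        _, scores_list, present_label, _ = decoder.prediction_from_trained_model(
            ys, encoder_outputs, scores_list)
        if present_label.item() == eos_id:
            break
        ys = torch.cat([ys, present_label.view(1, 1)], dim=1)
    return ys[0, 1:]  # predicted labels, <sos> dropped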
def forward(self, padded_input, encoder_padded_outputs, return_attns=True):
    """
    Args:
        padded_input: N x To
        encoder_padded_outputs: N x Ti x H
    Returns:
        output_dict with the loss ('cost'), the per-layer attention
        lists, and the label error rates.
    """
    dec_slf_attn_list, dec_enc_attn_list = [], []

    # Get decoder input and output (target shifted with <sos>/<eos>)
    ys_in_pad, ys_out_pad = self.preprocess(padded_input)

    # Prepare masks: only the causal (subsequent) mask is used here;
    # the padding masks are intentionally disabled.
    slf_attn_mask = get_subsequent_mask(ys_in_pad)
    non_pad_mask = None
    dec_enc_attn_mask = None

    embd_output = self.tgt_word_emb(ys_in_pad)
    dec_output = self.positional_encoding(embd_output)  # applies dropout (p=0.1) internally

    for dec_layer in self.layer_stack:
        dec_output, dec_slf_attn, dec_enc_attn = dec_layer(
            dec_output,
            encoder_padded_outputs,
            non_pad_mask=non_pad_mask,
            slf_attn_mask=slf_attn_mask,
            dec_enc_attn_mask=dec_enc_attn_mask)
        if return_attns:
            dec_slf_attn_list += [dec_slf_attn]
            dec_enc_attn_list += [dec_enc_attn]

    # Normalize the decoder states and project them to vocabulary logits
    dec_output = self.output_norm(dec_output)
    seq_logit = self.tgt_word_prj(dec_output)

    pred, gold = seq_logit, ys_out_pad
    cost, CER = cal_performance(pred,
                                gold,
                                self.IGNORE_ID,
                                normalize_length=False,
                                smoothing=self.label_smoothing)

    output_dict = {
        'cost': cost,
        'dec_slf_attn_list': dec_slf_attn_list,
        'dec_enc_attn_list': dec_enc_attn_list,
        'Char_cer': CER,
        'Word_cer': CER  # character-level CER reported under both keys
    }
    return output_dict
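# --- Reference sketch of the causal mask (assumption, for illustration) ---
# Both methods above rely on get_subsequent_mask, which is defined elsewhere
# in this repo. A common Speech-Transformer-style implementation looks like
# the sketch below; the actual helper may differ in dtype or convention.
def get_subsequent_mask_sketch(seq):
    """Return an N x T x T mask in which 1 marks the future positions
    that position i must not attend to (strict upper triangle)."""
    sz_b, len_s = seq.size()
    mask = torch.triu(
        torch.ones((len_s, len_s), device=seq.device, dtype=torch.uint8),
        diagonal=1)
    return mask.unsqueeze(0).expand(sz_b, -1, -1)  # N x T x T

# For a length-4 sequence each T x T slice is:
# [[0, 1, 1, 1],
#  [0, 0, 1, 1],
#  [0, 0, 0, 1],
#  [0, 0, 0, 0]]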