def __init__(self, task, bert_model, marginalization, tau_gumbel_softmax,
             hard_gumbel_softmax, eps_gumbel_softmax, label_smoothing,
             soft_bert_score, mixed_proportion):
    super().__init__(task)
    self.bert_model = bert_model
    self.marginalization = marginalization
    self.bert_scorer = BERTScorer(
        self.bert_model, soft_bert_score=soft_bert_score)  # , device='cpu')
    self.pad_token_id = self.bert_scorer._tokenizer.convert_tokens_to_ids(
        '[PAD]')
    # Gumbel-Softmax hyperparameters
    self.tau_gumbel_softmax = tau_gumbel_softmax
    self.hard_gumbel_softmax = hard_gumbel_softmax
    self.eps_gumbel_softmax = eps_gumbel_softmax
    # NLL parameters
    self.eps = label_smoothing
    # Cosine loss
    self.cos_loss = CosineEmbeddingLoss(reduction='sum')
    self._lambda = torch.tensor(mixed_proportion).to(self.bert_scorer.device)
    # File
    self.loss_stats_file = open('stats_mixed_nll_bert_sparsemax.txt', 'w')
    self.loss_stats_file.write('accuracy\tF_BERT\tLoss\n')
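# A minimal sketch (not the author's exact training step) of how the stored
# Gumbel-Softmax hyperparameters are typically applied: decoder logits are
# relaxed into (near-)one-hot token distributions that can be multiplied with
# the scorer's embedding matrix, keeping the BERTScore term differentiable.
# `decoder_logits` and `bert_input_embeddings` are hypothetical names.
import torch
import torch.nn.functional as F

def soft_tokens(decoder_logits, bert_input_embeddings, tau=1.0, hard=False):
    # decoder_logits: (batch, seq_len, vocab_size)
    # bert_input_embeddings: (vocab_size, hidden) embedding matrix of the scorer
    y = F.gumbel_softmax(decoder_logits, tau=tau, hard=hard, dim=-1)
    # Soft embeddings: weighted average of embedding rows per position.
    return y @ bert_input_embeddings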
def __init__(self, student_config, teacher_config, device, args):
    self.mse_loss = MSELoss()
    self.kl_loss = KLDivLoss(reduction='batchmean')
    self.cosine_loss = CosineEmbeddingLoss()
    self.distill_config = student_config.distillation_config
    self.device = device
    self.student_config = student_config
    self.teacher_config = teacher_config
    self.batch_size = args.train_batch_size
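# A minimal sketch, under assumed tensor names and weights, of how the three
# criteria instantiated above are commonly combined in knowledge distillation:
# KL divergence between temperature-scaled logits plus MSE and cosine
# alignment of hidden states. `temperature` and the arguments are illustrative,
# not taken from the source configuration.
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, student_hidden,
                      teacher_hidden, temperature=2.0):
    kl = F.kl_div(F.log_softmax(student_logits / temperature, dim=-1),
                  F.softmax(teacher_logits / temperature, dim=-1),
                  reduction='batchmean') * temperature ** 2
    mse = F.mse_loss(student_hidden, teacher_hidden)
    target = torch.ones(student_hidden.size(0), device=student_hidden.device)
    cos = F.cosine_embedding_loss(student_hidden.flatten(1),
                                  teacher_hidden.flatten(1), target)
    return kl + mse + cos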
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    input_labels=None,
):
    loss = defaultdict(float)
    outputs = self.roberta(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
    )
    sequences_output = outputs[0]  # bs x seq x hidden
    # NOTE: input_labels is read here unconditionally, so it must be provided;
    # the None check below only guards the loss computation.
    syn_labels = input_labels['syn_labels']  # bs
    positions = input_labels['positions']  # bs x 4
    syn_features = self.extract_features(sequences_output, positions)  # bs x hidden
    clf = self.syn_mse_clf if self.local_config['loss'] in {
        'mseplus_loss', 'mse_loss'
    } else self.syn_clf
    syn_logits = clf(syn_features)  # bs x 2 or bs

    if input_labels is not None:
        if self.local_config['loss'] != 'cosine_similarity':
            y_size = syn_logits.size(-1)
        else:
            y_size = -1
        if y_size == 1:
            lossfn = MSELoss() if self.local_config['loss'] == 'mse_loss' else MSEPlusLoss()
            loss['total'] = lossfn(syn_logits, syn_labels.unsqueeze(-1).float())
        elif self.local_config['loss'] == 'crossentropy_loss':
            loss['total'] = CrossEntropyLoss()(syn_logits, syn_labels)
        else:
            # Map syn_labels from {0, 1} to {-1, 1} for CosineEmbeddingLoss.
            loss['total'] = CosineEmbeddingLoss()(syn_logits[0], syn_logits[1],
                                                  syn_labels * 2 - 1)
    return (loss, syn_logits)
def __init__(
        self,
        optimizer_class=torch.optim.Adam,
        optim_wt_decay=0.,
        epochs=5,
        regularization=None,
        loss_type='cos',
        all_senses=None,
        all_supersenses=None,
        elmo_class=None,  # for sense vector in the model
        file_path="",
        device=device,
        **kwargs):
    ## Training parameters
    self.epochs = epochs
    self.elmo_class = elmo_class

    ## Optimizer
    self.optimizer = optimizer_class
    self.optim_wt_decay = optim_wt_decay

    # Target word index and senses list
    self.all_senses = all_senses
    self.all_supersenses = all_supersenses
    self._init_kwargs = kwargs
    self.device = device

    # Loss used to measure the similarity between two tensors
    if loss_type == 'mse':
        self.loss = MSELoss().to(self.device)
    else:
        self.loss = CosineEmbeddingLoss().to(self.device)
    '''
    if regularization == "l1":
        self.regularization = L1Loss()
    elif regularization == "smoothl1":
        self.regularization = SmoothL1Loss()
    else:
        self.regularization = None
    '''
    self.best_model_file = file_path + "word_sense_model_.pth"
    '''
def forward(self, input_ids, token_type_ids=None, attention_mask=None,
            labels=None, sim_labels=None):
    # Mask that keeps only sentence 1 tokens (token_type_ids == 0).
    sen1_attention_mask = (1 - token_type_ids) * attention_mask
    _, pooled_output_combined = self.bert(input_ids, token_type_ids,
                                          attention_mask,
                                          output_all_encoded_layers=False)
    pooled_output_combined = self.dropout(pooled_output_combined)
    _, pooled_output_sen1 = self.bert(input_ids, token_type_ids,
                                      sen1_attention_mask,
                                      output_all_encoded_layers=False)
    cos_sim = self.cosine(pooled_output_combined,
                          pooled_output_sen1).unsqueeze(1)
    combined = torch.cat([pooled_output_combined, cos_sim], dim=1)
    logits = self.classifier(combined)

    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss_bert = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        # print("Labels:", labels[10:])
        # new_labels = (1.0 - labels) + (labels * -1.0)
        # print("New Labels:", new_labels[10:])
        # CosineEmbeddingLoss expects targets in {1, -1}.
        loss_cosine = CosineEmbeddingLoss()
        loss_intent = loss_cosine(pooled_output_combined, pooled_output_sen1,
                                  sim_labels.float())
        loss = self.alpha * loss_bert + (1 - self.alpha) * loss_intent
        return loss
    else:
        return logits
def __init__(self, task, bert_model, marginalization, tau_gumbel_softmax,
             hard_gumbel_softmax, eps_gumbel_softmax, soft_bert_score,
             force_alignment):
    super().__init__(task)
    self.bert_model = bert_model
    self.marginalization = marginalization
    self.force_alignment = force_alignment
    self.bert_scorer = BERTScorer(
        self.bert_model, soft_bert_score=soft_bert_score)  # , device='cpu')
    self.pad_token_id = self.bert_scorer._tokenizer.convert_tokens_to_ids('[PAD]')
    # Gumbel-Softmax hyperparameters
    self.tau_gumbel_softmax = tau_gumbel_softmax
    self.hard_gumbel_softmax = hard_gumbel_softmax
    self.eps_gumbel_softmax = eps_gumbel_softmax
    # Cosine loss
    self.cos_loss = CosineEmbeddingLoss(reduction='sum')
    # self.cos_sim = CosineSimilarity(dim=1)
    # File
    self.loss_stats_file = open(
        'stats_aligned_bert_' + self.marginalization + '.txt', 'w')
    self.loss_stats_file.write('accuracy\tBERT_loss\n')
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    labels=None,
    input_ids2=None,
    attention_mask2=None,
    token_type_ids2=None,
    position_ids2=None,
    head_mask2=None,
    inputs_embeds2=None,
    labels2=None,
):
    r"""
    Encodes two inputs with a shared BERT encoder and classifies the pair from
    the concatenation, absolute difference, element-wise product and cosine
    similarity of the two pooled outputs.

    labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
        Classification labels in :obj:`[0, ..., config.num_labels - 1]`
        (regression if :obj:`config.num_labels == 1`).
    labels2 (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
        Pair similarity labels in :obj:`{0, 1}`, used as targets for the
        :obj:`CosineEmbeddingLoss` between the two pooled outputs.

    Returns:
        The combined loss (cross-entropy + cosine embedding loss) when
        ``labels`` is provided, otherwise the classification logits.
    """
    _, outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        # position_ids=position_ids,
        # head_mask=head_mask,
        # inputs_embeds=inputs_embeds,
    )
    _, outputs2 = self.bert(
        input_ids2,
        attention_mask=attention_mask2,
        token_type_ids=token_type_ids2,
        # position_ids=position_ids2,
        # head_mask=head_mask2,
        # inputs_embeds=inputs_embeds2,
    )
    pooled_output = outputs
    pooled_output2 = outputs2
    pooled_output = self.dropout(pooled_output)
    pooled_output2 = self.dropout(pooled_output2)

    # A series of different combinations: concat, |minus|, multiply, ...
    final_output_cat = torch.cat((pooled_output, pooled_output2), 1)
    final_output_minus = torch.abs(pooled_output - pooled_output2)
    final_output_mult = torch.mul(pooled_output, pooled_output2)
    # final_output_mimu = torch.cat((final_output_minus, final_output_mult), 1)
    # final_output_camu = torch.cat((final_output_cat, final_output_mult), 1)
    # final_output_cami = torch.cat((final_output_cat, final_output_minus), 1)
    final_output_camimu = torch.cat(
        (final_output_cat, final_output_minus, final_output_mult), 1)
    cos_pooled_outputs = torch.cosine_similarity(pooled_output, pooled_output2,
                                                 dim=1)
    # Classifier input widths of the combination variants 1-7 range from
    # hidden_size up to hidden_size * 4 (see the disabled lines above).
    # batch_size = list(pooled_output.size())[0]
    # hidden_size = list(pooled_output.size())[1]
    final_output_all = torch.cat(
        (final_output_camimu, cos_pooled_outputs.unsqueeze(1)), 1)
    logits_ce = self.classifier(final_output_all)
    # logits_ori = self.classifier2(final_output_camimu)

    # Calculate loss during training
    if labels is not None:
        if self.num_labels == 1:
            # We are doing regression
            loss_fct = MSELoss()
            loss = loss_fct(logits_ce.view(-1), labels.view(-1))
        else:
            loss_fct_ce = CrossEntropyLoss()
            loss_ce = loss_fct_ce(logits_ce.view(-1, self.num_labels),
                                  labels.view(-1))
            # loss_ori = loss_fct_ce(logits_ori.view(-1, self.num_labels), labels.view(-1))
            loss_fct_cos = CosineEmbeddingLoss()
            # Map labels2 from {0, 1} to {-1, 1} for CosineEmbeddingLoss, then restore.
            # (An inverted mapping {1 -> -1, 0 -> +1} was also tried here.)
            labels2[labels2 == 0] = -1
            loss_cos = loss_fct_cos(pooled_output, pooled_output2, labels2)
            labels2[labels2 == -1] = 0
            loss = loss_ce + loss_cos
        outputs = loss
        return outputs
    else:
        # Get predictions when doing evaluation
        return logits_ce
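# A minimal alternative sketch (assumed names) of the {0, 1} -> {-1, 1} target
# mapping used above, avoiding in-place mutation of the label tensor:
import torch
from torch.nn import CosineEmbeddingLoss

def pair_cosine_loss(emb1, emb2, labels01):
    targets = labels01.float() * 2 - 1  # 0 -> -1 (dissimilar), 1 -> +1 (similar)
    return CosineEmbeddingLoss()(emb1, emb2, targets)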
def CosineLoss(A, B):
    # Pull A and B together: a constant target of +1 marks every pair as
    # similar. Note that the margin only affects target -1 pairs, so
    # margin=0.5 has no effect here.
    lossfunc = CosineEmbeddingLoss(margin=0.5)
    y = torch.tensor(1.0)
    if A.is_cuda:
        y = y.cuda()
    return lossfunc(A, B, target=y)
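# Example usage of the helper above (illustrative tensors): the per-pair loss
# is 1 - cos(A, B), so identical inputs give ~0 and unrelated random inputs
# give ~1 on average.
import torch

a = torch.randn(4, 128)
print(CosineLoss(a, a).item())                    # ~0.0
print(CosineLoss(a, torch.randn(4, 128)).item())  # ~1.0 on average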
def forward(self, input_ids, token_type_ids=None, attention_mask=None,
            labels=None, entity_labels=None, checkpoint_activations=False):
    sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask,
                                   output_all_encoded_layers=False)
    sequence_output = self.dropout(sequence_output)
    logits = self.classifier(sequence_output)

    # **YD** entity branch forward.
    entity_logits = self.entity_classifier(sequence_output)
    # **YD** may not require activation function
    entity_logits = self.activate(entity_logits)
    # entity_logits = F.normalize(entity_logits, 2, 2)
    # entity_logits = torch.matmul(entity_logits, self.entity_emb.weight.T)
    # entity_logits = torch.log(entity_logits)

    if labels is not None:
        loss_fct = CrossEntropyLoss()
        entity_loss_fct = CosineEmbeddingLoss()
        # Only keep active parts of the loss
        if attention_mask is not None:
            '''
            active_loss = attention_mask.view(-1) == 1
            active_logits = logits.view(-1, self.num_labels)[active_loss]
            active_labels = labels.view(-1)[active_loss]
            loss = loss_fct(active_logits, active_labels)
            '''
            active_loss = attention_mask.view(-1) == 1
            active_logits = logits.view(-1, self.num_labels)
            active_labels = torch.where(
                active_loss,
                labels.view(-1),
                torch.tensor(loss_fct.ignore_index).type_as(labels)
            )
            ner_loss = loss_fct(active_logits, active_labels)

            '''
            entity_labels[entity_labels == _OUT_DICT_ENTITY_ID] = _IGNORE_CLASSIFICATION_LABEL
            assert entity_labels.requires_grad is False
            entity_active_logits = entity_logits.view(-1, self.num_entity_labels)
            entity_active_labels = torch.where(
                active_loss,
                entity_labels.view(-1),
                torch.tensor(entity_loss_fct.ignore_index).type_as(entity_labels)
            )
            entity_loss = entity_loss_fct(entity_active_logits, entity_active_labels)
            '''

            # entity_active_loss = (labels.view(-1) == NER_LABEL_DICT['B']) | active_loss
            entity_active_loss = (entity_labels.view(-1) > 0)
            entity_active_logits = entity_logits.view(-1, self.dim_entity_emb)[entity_active_loss]
            entity_active_labels = entity_labels.view(-1)[entity_active_loss]
            # Pull each predicted entity vector toward its gold entity
            # embedding: a target of +1 marks every selected pair as similar.
            entity_loss = entity_loss_fct(
                entity_active_logits,
                self.entity_emb.weight[entity_active_labels],
                torch.tensor(1).type_as(entity_labels)
            )

            print('ner_loss', ner_loss, 'entity_loss', entity_loss)
            if torch.isnan(entity_loss):
                loss = ner_loss
            else:
                loss = ner_loss + entity_loss
            assert not torch.isnan(loss)
        else:
            # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            raise ValueError("attention_mask must not be None")
        return loss
    else:
        return logits, entity_logits
def __init__(self, args, src_dict, tgt_dict, src_embedding, tgt_embedding,
             device):
    super(E2E, self).__init__(args)
    self.args = args
    self.src_dict = src_dict
    self.tgt_dict = tgt_dict
    # src_flow: assume tgt embeddings are transformed from the src mog space
    self.register_buffer('src_embedding', src_embedding)
    self.register_buffer('tgt_embedding', tgt_embedding)
    if args.init_var:
        # initialize with gaussian variance
        self.register_buffer("s2t_s_var", src_dict.var)
        self.register_buffer("s2t_t_var", tgt_dict.var)
        self.register_buffer("t2s_s_var", src_dict.var)
        self.register_buffer("t2s_t_var", tgt_dict.var)
    else:
        self.s2t_s_var = args.s_var
        self.s2t_t_var = args.s2t_t_var
        self.t2s_t_var = args.t_var
        self.t2s_s_var = args.t2s_s_var
    self.register_buffer('src_freqs',
                         torch.tensor(src_dict.freqs, dtype=torch.float))
    self.register_buffer('tgt_freqs',
                         torch.tensor(tgt_dict.freqs, dtype=torch.float))
    # backward: t2s
    self.src_flow = MogFlow_batch(args, self.t2s_s_var)
    # backward: s2t
    self.tgt_flow = MogFlow_batch(args, self.s2t_t_var)
    self.s2t_valid_dico = None
    self.t2s_valid_dico = None
    self.device = device
    # Use dict pairs from train data (supervise) or identical words
    # (supervise_id) as supervision.
    self.supervise = args.supervise_id
    if self.supervise:
        self.load_training_dico()
    if args.sup_obj == 'mse':
        self.sup_loss_func = nn.MSELoss()
    elif args.sup_obj == 'cosine':
        self.sup_loss_func = CosineEmbeddingLoss()

    optim_fn, optim_params = get_optimizer(args.flow_opt_params)
    self.flow_optimizer = optim_fn(
        list(self.src_flow.parameters()) + list(self.tgt_flow.parameters()),
        **optim_params)
    self.flow_scheduler = torch.optim.lr_scheduler.ExponentialLR(
        self.flow_optimizer, gamma=args.lr_decay)
    self.best_valid_metric = 1e-12
    self.sup_sw = args.sup_s_weight
    self.sup_tw = args.sup_t_weight
    self.mse_loss = nn.MSELoss()
    self.cos_loss = CosineEmbeddingLoss()
    # Evaluation on a trained model
    if args.load_from_pretrain_s2t != "" or args.load_from_pretrain_t2s != "":
        self.load_from_pretrain()
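# A minimal sketch (hypothetical tensors) of how a supervised objective such
# as `self.sup_loss_func` above is typically applied to a training dictionary:
# mapped source vectors should coincide with their aligned target vectors, so
# every pair gets a cosine target of +1.
import torch
from torch.nn import CosineEmbeddingLoss

mapped_src = torch.randn(32, 300)   # source embeddings after the mapping / flow
aligned_tgt = torch.randn(32, 300)  # gold target embeddings from the dictionary
sup_loss = CosineEmbeddingLoss()(mapped_src, aligned_tgt,
                                 torch.ones(mapped_src.size(0)))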
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    labels=None,
    input_ids2=None,
    attention_mask2=None,
    token_type_ids2=None,
    position_ids2=None,
    head_mask2=None,
    inputs_embeds2=None,
    labels2=None,
):
    r"""
    Encodes two inputs with a shared BERT encoder and classifies the first
    input from its pooled output concatenated with the cosine similarity to
    the second input's pooled output.

    labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
        Classification labels in :obj:`[0, ..., config.num_labels - 1]`
        (regression if :obj:`config.num_labels == 1`).
    labels2 (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
        Pair similarity labels in :obj:`{0, 1}`, used as targets for the
        :obj:`CosineEmbeddingLoss` between the two pooled outputs.

    Returns:
        The combined loss (cross-entropy + cosine embedding loss) when
        ``labels`` is provided, otherwise the classification logits.
    """
    _, outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        # position_ids=position_ids,
        # head_mask=head_mask,
        # inputs_embeds=inputs_embeds,
    )
    _, outputsC = self.bert(
        input_ids2,
        attention_mask=attention_mask2,
        token_type_ids=token_type_ids2,
        # position_ids=position_ids2,
        # head_mask=head_mask2,
        # inputs_embeds=inputs_embeds2,
    )
    pooled_output = outputs
    pooled_outputC = outputsC
    pooled_output = self.dropout(pooled_output)
    # pooled_outputC = self.dropout(pooled_outputC)
    cos_pooled_outputs = torch.cosine_similarity(pooled_output, pooled_outputC,
                                                 dim=1)
    batch_size = list(pooled_output.size())[0]
    hidden_size = list(pooled_output.size())[1]
    # logits_ce = self.classifier2(pooled_outputC)
    # Variants explored:
    #   v2: concat                      v3: multiply
    #   v4: v2 & cosine similarity      v5: v3 & cosine similarity
    logits_cos = self.classifier(
        torch.cat((pooled_output, cos_pooled_outputs.unsqueeze(1)), 1))
    logits_final = logits_cos
    # logits_cos = self.classifier2((pooled_output * cos_pooled_outputs.unsqueeze(1)))
    # self.classifier = torch.nn.Linear(hidden_size + batch_size, 2).to(device)
    # logits_cos = self.classifier(torch.cat((pooled_output, cos_pooled_outputs.repeat(batch_size, 1)), 1))
    # logits = self.classifier(pooled_output)
    # logitsC = self.classifier(pooled_outputC)
    # outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

    if labels is not None:
        if self.num_labels == 1:
            # We are doing regression
            loss_fct = MSELoss()
            loss = loss_fct(logits_final.view(-1), labels.view(-1))
        else:
            loss_fct_ce = CrossEntropyLoss()
            loss_ce = loss_fct_ce(logits_final.view(-1, self.num_labels),
                                  labels.view(-1))
            loss_fct_cos = CosineEmbeddingLoss()
            # Map labels2 from {0, 1} to {-1, 1} for CosineEmbeddingLoss, then restore.
            labels2[labels2 == 0] = -1
            loss_cos = loss_fct_cos(pooled_output, pooled_outputC, labels2)
            labels2[labels2 == -1] = 0
            # loss_cos = loss_fct_cos(logits_ce, logits_cos, labels2)
            # loss_cos = loss_fct_ce(logits_cos.view(-1, self.num_labels), labels2.view(-1))
            loss = loss_cos + loss_ce
        # logits = self.classifier(loss)
        outputs = loss
        return outputs
    else:
        return logits_final
def getPredictionLossFn(cl=None, net=None):
    kldivLoss = KLDivLoss()
    mseLoss = MSELoss()
    smoothl1Loss = SmoothL1Loss()
    tripletLoss = TripletMarginLoss()  # TripletLoss()
    cosineLoss = CosineEmbeddingLoss(margin=0.5)

    if PREDICTION_LOSS == 'MSE':
        def prediction_loss(predFeature, nextFeature):
            return mseLoss(predFeature, nextFeature)
    elif PREDICTION_LOSS == 'SMOOTHL1':
        def prediction_loss(predFeature, nextFeature):
            return smoothl1Loss(predFeature, nextFeature)
    elif PREDICTION_LOSS == 'TRIPLET':
        def prediction_loss(predFeature, nextFeature, negativeFeature=None,
                            cl=cl, net=net):
            if negativeFeature is None:
                negatives, _, _ = cl.randomSamples(1)  # predFeature.size(0)
                negativeFeature = net(
                    Variable(negatives[0], requires_grad=False).cuda(),
                    Variable(negatives[1], requires_grad=False).cuda()).detach()
            return tripletLoss(predFeature.unsqueeze(0),
                               nextFeature.unsqueeze(0), negativeFeature)
    elif PREDICTION_LOSS == 'COSINE':
        def prediction_loss(predFeature, nextFeature, negativeFeature=None,
                            cl=cl, net=net):
            if negativeFeature is None:
                negatives, _, _ = cl.randomSamples(1)  # predFeature.size(0)
                negativeFeature = net(
                    Variable(negatives[0], requires_grad=False).cuda(),
                    Variable(negatives[1], requires_grad=False).cuda()).detach()
            else:
                negativeFeature = negativeFeature.unsqueeze(0)
            predFeature = predFeature.unsqueeze(0)
            nextFeature = nextFeature.unsqueeze(0)
            # Concatenate positive and negative features and create
            # +1 / -1 targets for the concatenated pairs.
            input1 = torch.cat([predFeature, predFeature], dim=0)
            input2 = torch.cat([nextFeature, negativeFeature], dim=0)
            target1 = Variable(torch.ones(predFeature.size(0)),
                               requires_grad=False).detach().cuda()
            target2 = -target1
            target = torch.cat([target1, target2], dim=0)
            return cosineLoss(input1, input2, target)
    else:
        def prediction_loss(predFeature, nextFeature):
            return kldivLoss(F.log_softmax(predFeature), F.softmax(nextFeature))
    return prediction_loss
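# A self-contained illustration (hypothetical tensors) of the pattern used in
# the 'COSINE' branch above: positives get target +1, negatives get target -1,
# and the margin only penalizes negatives whose cosine similarity exceeds it.
import torch
from torch.nn import CosineEmbeddingLoss

loss_fn = CosineEmbeddingLoss(margin=0.5)
pred = torch.randn(2, 64)
positive = pred + 0.01 * torch.randn(2, 64)   # nearly identical -> low loss
negative = torch.randn(2, 64)                 # unrelated -> pushed below the margin
inputs1 = torch.cat([pred, pred], dim=0)
inputs2 = torch.cat([positive, negative], dim=0)
targets = torch.cat([torch.ones(2), -torch.ones(2)], dim=0)
print(loss_fn(inputs1, inputs2, targets))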
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    labels=None,
    input_ids2=None,
    attention_mask2=None,
    token_type_ids2=None,
    position_ids2=None,
    head_mask2=None,
    inputs_embeds2=None,
    labels2=None,
    input_ids3=None,
    attention_mask3=None,
    token_type_ids3=None,
    position_ids3=None,
    head_mask3=None,
    inputs_embeds3=None,
    labels3=None,
    # input_ids4=None,
    # attention_mask4=None,
    # token_type_ids4=None,
    # position_ids4=None,
    # head_mask4=None,
    # inputs_embeds4=None,
    # labels4=None
):
    r"""
    Encodes a perspective (inputs 1), a claim (inputs 2) and an opposing
    perspective (inputs 3) with a shared BERT encoder, scores the
    claim-perspective and claim-opposing-perspective pairs, and combines a
    cross-entropy loss on the pair logits with a triplet loss over the three
    pooled representations.

    labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
        Classification labels in :obj:`[0, ..., config.num_labels - 1]`
        (regression if :obj:`config.num_labels == 1`).

    Returns:
        The combined loss when ``labels`` is provided, otherwise the final
        pair logits.
    """
    # Perspective representation
    _, outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )
    # Claim representation
    _, outputs2 = self.bert(
        input_ids2,
        attention_mask=attention_mask2,
        token_type_ids=token_type_ids2,
    )
    # Opposing perspective representation
    _, outputs3 = self.bert(
        input_ids3,
        attention_mask=attention_mask3,
        token_type_ids=token_type_ids3,
    )
    # Opposing claim representation (disabled)
    # _, outputs4 = self.bert(
    #     input_ids4,
    #     attention_mask=attention_mask4,
    #     token_type_ids=token_type_ids4,
    # )
    pooled_output = outputs
    pooled_output2 = outputs2
    pooled_output3 = outputs3
    # pooled_output4 = outputs4
    pooled_output = self.dropout(pooled_output)
    pooled_output2 = self.dropout(pooled_output2)
    pooled_output3 = self.dropout(pooled_output3)
    # pooled_output4 = self.dropout(pooled_output4)

    # A series of different combinations: concat, |minus|, multiply, ...
    # Claim vs. perspective
    final_output_cat = torch.cat((pooled_output2, pooled_output), 1)
    final_output_minus = torch.abs(pooled_output2 - pooled_output)
    final_output_mult = torch.mul(pooled_output2, pooled_output)
    final_output_camimu = torch.cat(
        (final_output_cat, final_output_minus, final_output_mult), 1)
    cos_pooled_outputs = torch.cosine_similarity(pooled_output2, pooled_output,
                                                 dim=1)

    # ocop_* variants (opposing claim vs. opposing perspective) are disabled:
    # ocop_final_output_cat = torch.cat((pooled_output4, pooled_output3), 1)
    # ocop_final_output_minus = torch.abs(pooled_output4 - pooled_output3)
    # ocop_final_output_mult = torch.mul(pooled_output4, pooled_output3)
    # ocop_final_output_camimu = torch.cat((ocop_final_output_cat, ocop_final_output_minus, ocop_final_output_mult), 1)
    # ocop_cos_pooled_outputs = torch.cosine_similarity(pooled_output4, pooled_output3, dim=1)

    # Claim vs. opposing perspective
    cop_final_output_cat = torch.cat((pooled_output2, pooled_output3), 1)
    cop_final_output_minus = torch.abs(pooled_output2 - pooled_output3)
    cop_final_output_mult = torch.mul(pooled_output2, pooled_output3)
    cop_final_output_camimu = torch.cat(
        (cop_final_output_cat, cop_final_output_minus, cop_final_output_mult), 1)
    cop_cos_pooled_outputs = torch.cosine_similarity(pooled_output2,
                                                     pooled_output3, dim=1)

    # ocp_* variants (opposing claim vs. perspective) are disabled:
    # ocp_final_output_cat = torch.cat((pooled_output4, pooled_output), 1)
    # ocp_final_output_minus = torch.abs(pooled_output4 - pooled_output)
    # ocp_final_output_mult = torch.mul(pooled_output4, pooled_output)
    # ocp_final_output_camimu = torch.cat((ocp_final_output_cat, ocp_final_output_minus, ocp_final_output_mult), 1)
    # ocp_cos_pooled_outputs = torch.cosine_similarity(pooled_output4, pooled_output, dim=1)

    batch_size = list(pooled_output.size())[0]
    hidden_size = list(pooled_output.size())[1]
    final_output_all = torch.cat(
        (final_output_camimu, cos_pooled_outputs.unsqueeze(1)), 1)
    cop_final_output_all = torch.cat(
        (cop_final_output_camimu, cop_cos_pooled_outputs.unsqueeze(1)), 1)
    # ocp_final_output_all = torch.cat((ocp_final_output_camimu, ocp_cos_pooled_outputs.unsqueeze(1)), 1)
    # ocop_final_output_all = torch.cat((ocop_final_output_camimu, ocop_cos_pooled_outputs.unsqueeze(1)), 1)

    logits_ce = self.classifier(final_output_all)
    # ocop_logits_ce = self.classifier(ocop_final_output_all)
    cop_logits_ce = self.classifier(cop_final_output_all)
    # ocp_logits_ce = self.classifier(ocp_final_output_all)

    # Grid search over the weights of the four pair logits (disabled):
    # best_score = 0
    # logits_grid = []
    # for ori in (list(np.arange(0, 2.5, 0.5)) + [10, 100, 1000]):
    #     for cop in (list(np.arange(0, 2.5, 0.5)) + [10, 100, 1000]):
    #         for ocp in (list(np.arange(0, 2.5, 0.5)) + [10, 100, 1000]):
    #             for ocop in (list(np.arange(0, 2.5, 0.5)) + [10, 100, 1000]):
    #                 logits_grid.append((ori * logits_ce) - (cop * cop_logits_ce)
    #                                    - (ocp * ocp_logits_ce) + (ocop * ocop_logits_ce))

    # if input_ids4 and input_ids3:
    final_logits = (1 * logits_ce) - (1 * cop_logits_ce)
    # elif input_ids3:
    #     final_logits = logits_ce - (0.33 * cop_logits_ce)
    # elif input_ids4:
    #     final_logits = logits_ce - (0.33 * ocp_logits_ce)
    # else:
    #     final_logits = logits_ce

    # Calculate loss during training
    if labels is not None:
        if self.num_labels == 1:
            # We are doing regression
            loss_fct = MSELoss()
            loss = loss_fct(final_logits.view(-1), labels.view(-1))
        else:
            loss_fct_ce = CrossEntropyLoss()
            loss_ce = loss_fct_ce(final_logits.view(-1, self.num_labels),
                                  labels.view(-1))
            loss_fct_cos = CosineEmbeddingLoss()  # instantiated but unused below
            loss_fct_tri = TripletLoss()
            # labels2[labels2 == 0] = -1
            # loss_cos = loss_fct_cos(pooled_output, pooled_output2, labels2)
            # labels2[labels2 == -1] = 0

            # For negative examples (label == 0), swap the perspective and the
            # opposing perspective so that the triplet anchor (the claim) is
            # always pulled toward the supporting side and pushed away from
            # the opposing one.
            k = 0
            index = []
            for i in labels:
                k = k + 1
                if i == 0:
                    index.append(k)
            pooled_output_inter = pooled_output.clone().detach()
            pooled_output3_inter = pooled_output3.clone().detach()
            pooled_output_inter2 = pooled_output.clone().detach()
            pooled_output3_inter2 = pooled_output3.clone().detach()
            for l in index:
                pooled_output_inter[l - 1], pooled_output3_inter[l - 1] = \
                    pooled_output3_inter[l - 1], pooled_output_inter[l - 1]
            for l in index:
                pooled_output3_inter2[l - 1], pooled_output_inter2[l - 1] = \
                    pooled_output_inter2[l - 1], pooled_output3_inter2[l - 1]
            loss_tri = loss_fct_tri(pooled_output2, pooled_output_inter,
                                    pooled_output3_inter2)
            loss = loss_ce + loss_tri
        outputs = loss
        return outputs
    else:
        # Get predictions when doing evaluation
        return final_logits
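# A minimal sketch of the triplet objective used above, written with the
# built-in torch.nn.TripletMarginLoss instead of the repository's custom
# TripletLoss; the tensor names and margin are illustrative.
import torch
from torch.nn import TripletMarginLoss

triplet = TripletMarginLoss(margin=1.0)
claim = torch.randn(8, 768)        # anchor
supporting = torch.randn(8, 768)   # positive: should move toward the claim
opposing = torch.randn(8, 768)     # negative: should move away from the claim
print(triplet(claim, supporting, opposing))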
def forward(self, input_ids, token_type_ids=None, attention_mask=None,
            masked_lm_labels=None, **kwargs):
    sequence_output, _ = self._bert_model.bert(
        input_ids, token_type_ids, attention_mask,
        output_all_encoded_layers=False)
    prediction_scores = self._bert_model.cls(sequence_output)

    if masked_lm_labels is not None:
        loss_fct = CrossEntropyLoss(ignore_index=-1, reduction='sum')
        masked_lm_loss = loss_fct(
            prediction_scores.view(-1, self.bert_config.vocab_size),
            masked_lm_labels.view(-1))
        ## YS
        loss = masked_lm_loss
        ## YS
        if 'input_ref_ids' in kwargs:
            input_ref_ids = kwargs['input_ref_ids']
            # Encode the reference inputs (the original code re-encoded
            # `input_ids` here, leaving `input_ref_ids` unused).
            sequence_ref_output, _ = self._bert_model.bert(
                input_ref_ids, token_type_ids, attention_mask,
                output_all_encoded_layers=False)
            ## Similarity loss between [CLS] tokens: cosine similarity with
            ## in-batch negative samples.
            sim_loss_fct = CosineEmbeddingLoss(margin=0, reduction='mean')
            _bs, _seq_len, _bert_dim = sequence_output.size()
            _t_output = sequence_output[:, 0, :].unsqueeze(0).expand(
                _bs, _bs, _bert_dim).reshape(_bs * _bs, _bert_dim)
            _t_ref_output = sequence_ref_output[:, 0, :].unsqueeze(1).expand(
                _bs, _bs, _bert_dim).reshape(_bs * _bs, _bert_dim)
            # Targets: +1 on the diagonal (matching pairs), -1 elsewhere
            # (in-batch negatives).
            _y = torch.tensor(np.eye(_bs) * 2 - np.ones((_bs, _bs)),
                              dtype=sequence_output.dtype,
                              device=sequence_output.device).view(_bs * _bs)
            sim_loss = sim_loss_fct(_t_output, _t_ref_output, _y)
            loss += sim_loss
        sample_size = masked_lm_labels.ne(-1).sum().item()
        logging_output = {
            'sample_size': sample_size,
            'mlm_loss': masked_lm_loss.item(),
            'loss': loss.item()
        }
        if 'input_ref_ids' in kwargs:
            logging_output['sim_loss'] = sim_loss.item()
        return loss, logging_output
    else:
        return prediction_scores
def forward(self, output1, output2, is_diff):
    target = (1. - 2. * is_diff).float()  # map {0, 1} -> {1, -1}
    cos = CosineEmbeddingLoss(margin=self.margin, reduction=self.reduction)
    return cos(output1, output2, target)
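# An equivalent functional call (illustrative tensors and margin); the module
# above just wraps this with its stored margin and reduction. is_diff == 0
# pulls the pair together, is_diff == 1 pushes its cosine similarity below
# the margin.
import torch
import torch.nn.functional as F

emb_a = torch.randn(4, 256)
emb_b = torch.randn(4, 256)
is_diff = torch.tensor([0., 1., 0., 1.])
target = 1. - 2. * is_diff  # 0 -> +1 (same), 1 -> -1 (different)
loss = F.cosine_embedding_loss(emb_a, emb_b, target, margin=0.5)
print(loss)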