# Assumed module-level imports: the trainer methods below reference torch,
# F (torch.nn.functional), tqdm, and warmup_linear (presumably the schedule
# helper from pytorch_pretrained_bert.optimization).
import torch
import torch.nn.functional as F
from tqdm import tqdm
from pytorch_pretrained_bert.optimization import warmup_linear


def train_epoch(self, train_dataloader):
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
        self.model.train()
        # Move every tensor in the batch to the training device.
        batch = tuple(t.to(self.args.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        logits = self.model(input_ids, segment_ids, input_mask)

        if self.args.is_multilabel:
            loss = F.binary_cross_entropy_with_logits(logits, label_ids.float())
        else:
            loss = F.cross_entropy(logits, torch.argmax(label_ids, dim=1))

        if self.args.n_gpu > 1:
            loss = loss.mean()
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        if self.args.fp16:
            self.optimizer.backward(loss)
        else:
            loss.backward()

        self.tr_loss += loss.item()
        self.nb_tr_steps += 1
        # Only step the optimizer once every gradient_accumulation_steps batches.
        if (step + 1) % self.args.gradient_accumulation_steps == 0:
            if self.args.fp16:
                # fp16 bypasses the scheduler, so apply the warmup schedule
                # to the learning rate by hand.
                lr_this_step = self.args.learning_rate * warmup_linear(
                    self.iterations / self.num_train_optimization_steps,
                    self.args.warmup_proportion)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr_this_step
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.iterations += 1
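# For reference, a minimal sketch of the warmup_linear schedule used above,
# assuming it matches the helper shipped with pytorch_pretrained_bert: linear
# warmup to the peak rate over the first `warmup` fraction of training, then
# linear decay toward zero. The name warmup_linear_sketch is ours, to avoid
# shadowing the real import.
def warmup_linear_sketch(x, warmup=0.002):
    """x is training progress in [0, 1]; returns an LR multiplier."""
    if x < warmup:
        return x / warmup
    return 1.0 - x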
def train_epoch(self, train_dataloader, freez_layer=None):
    loss_epoch = 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
        self.model.train()
        if freez_layer:
            self.freez(freez_layer)
        batch = tuple(t.to(self.args.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        logits = self.model(input_ids, segment_ids, input_mask)

        # When the model also returns the two [SEP] representations, add an
        # auxiliary cosine-similarity loss: pull "similar" pairs (one-hot
        # label [0, 1]) together and push "dissimilar" pairs ([1, 0]) apart.
        loss_extra = 0
        if isinstance(logits, tuple):
            logits, (first_SEP, second_SEP) = logits
            cos_simi = F.cosine_similarity(first_SEP, second_SEP)
            # Build the reference labels on the same device as the batch
            # (the original unconditionally called .cuda(), which breaks
            # CPU runs).
            similar = torch.tensor([0, 1], device=label_ids.device)
            dissimilar = torch.tensor([1, 0], device=label_ids.device)
            for i in range(len(label_ids)):
                if torch.eq(label_ids[i], similar).all():
                    loss_extra += 1 - cos_simi[i]
                elif torch.eq(label_ids[i], dissimilar).all():
                    loss_extra += torch.clamp(cos_simi[i], min=0)
                else:
                    raise ValueError(f'Invalid label value: {label_ids[i]}')

        if self.args.is_multilabel:
            loss = F.binary_cross_entropy_with_logits(logits, label_ids.float())
        else:
            loss = F.cross_entropy(logits, torch.argmax(label_ids, dim=1))
        loss += loss_extra

        if self.args.n_gpu > 1:
            loss = loss.mean()
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        if self.args.fp16:
            self.optimizer.backward(loss)
        else:
            loss.backward()

        self.tr_loss += loss.item()
        loss_epoch += loss.item()
        self.nb_tr_steps += 1
        if (step + 1) % self.args.gradient_accumulation_steps == 0:
            if self.args.fp16:
                lr_this_step = self.args.learning_rate * warmup_linear(
                    self.iterations / self.num_train_optimization_steps,
                    self.args.warmup_proportion)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr_this_step
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.iterations += 1

    return loss_epoch / (step + 1)
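# The per-example Python loop above is easy to follow but slow on large
# batches. A vectorized equivalent (a sketch, assuming label_ids holds
# one-hot rows: [0, 1] for similar pairs, [1, 0] for dissimilar) computes
# the same auxiliary loss with pure tensor ops:
def cosine_pair_loss_sketch(first_SEP, second_SEP, label_ids):
    cos_simi = F.cosine_similarity(first_SEP, second_SEP)  # shape (batch,)
    is_similar = label_ids[:, 1].float()     # 1.0 for [0, 1] rows, else 0.0
    is_dissimilar = label_ids[:, 0].float()  # 1.0 for [1, 0] rows, else 0.0
    # Similar pairs are penalized for low similarity; dissimilar pairs for
    # positive similarity (hinged at zero), matching the loop above.
    return (is_similar * (1 - cos_simi)
            + is_dissimilar * cos_simi.clamp(min=0)).sum()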
def train_epoch(self, train_dataloader):
    loss_epoch = 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
        self.model.train()
        batch = tuple(t.to(self.args.device) for t in batch)
        # This variant carries an extra sent_scores tensor in each batch and
        # forwards it to the model.
        input_ids, input_mask, segment_ids, sent_scores, label_ids = batch
        logits = self.model(input_ids, segment_ids, input_mask,
                            sent_scores=sent_scores)

        # Same auxiliary cosine-similarity loss on the two [SEP]
        # representations as in the variant above.
        loss_extra = 0
        if isinstance(logits, tuple):
            logits, (first_SEP, second_SEP) = logits
            cos_simi = F.cosine_similarity(first_SEP, second_SEP)
            similar = torch.tensor([0, 1], device=label_ids.device)
            dissimilar = torch.tensor([1, 0], device=label_ids.device)
            for i in range(len(label_ids)):
                if torch.eq(label_ids[i], similar).all():
                    loss_extra += 1 - cos_simi[i]
                elif torch.eq(label_ids[i], dissimilar).all():
                    loss_extra += torch.clamp(cos_simi[i], min=0)
                else:
                    raise ValueError(f'Invalid label value: {label_ids[i]}')

        if self.args.is_multilabel:
            loss = F.binary_cross_entropy_with_logits(logits, label_ids.float())
        else:
            loss = F.cross_entropy(logits, torch.argmax(label_ids, dim=1))
        loss += loss_extra

        if self.args.n_gpu > 1:
            loss = loss.mean()
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        if self.args.fp16:
            self.optimizer.backward(loss)
        else:
            loss.backward()

        self.tr_loss += loss.item()
        loss_epoch += loss.item()
        self.nb_tr_steps += 1
        if (step + 1) % self.args.gradient_accumulation_steps == 0:
            if self.args.fp16:
                lr_this_step = self.args.learning_rate * warmup_linear(
                    self.iterations / self.num_train_optimization_steps,
                    self.args.warmup_proportion)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr_this_step
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.iterations += 1

    return loss_epoch / (step + 1)
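# A minimal sketch of the dataloader this variant expects: each batch must
# yield five tensors, with sent_scores in fourth position to match the
# unpacking above. The helper name and batch size are illustrative
# assumptions, not this project's actual preprocessing code.
from torch.utils.data import DataLoader, TensorDataset

def make_sent_score_loader_sketch(input_ids, input_mask, segment_ids,
                                  sent_scores, label_ids, batch_size=16):
    dataset = TensorDataset(input_ids, input_mask, segment_ids,
                            sent_scores, label_ids)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)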
def train_epoch(self, train_dataloader):
    self.tr_loss = 0
    predicted_labels, target_labels = list(), list()
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
        self.model.train()
        batch = tuple(t.to(self.args.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        logits = self.model(input_ids=input_ids,
                            attention_mask=input_mask,
                            token_type_ids=segment_ids)[0]

        if self.args.is_multilabel:
            # torch.sigmoid replaces the deprecated F.sigmoid.
            predicted_labels.extend(
                torch.sigmoid(logits).round().long().cpu().detach().numpy())
            target_labels.extend(label_ids.cpu().detach().numpy())
            if self.args.loss == 'cross-entropy':
                # Optional per-label positive-class weights, passed on the
                # command line as a comma-separated list.
                if self.args.pos_weights:
                    pos_weights = [float(w)
                                   for w in self.args.pos_weights.split(',')]
                    pos_weight = torch.FloatTensor(pos_weights)
                else:
                    pos_weight = torch.ones([self.args.num_labels])
                criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
                criterion = criterion.to(self.args.device)
                loss = criterion(logits, label_ids.float())
            elif self.args.loss == 'mse':
                criterion = torch.nn.MSELoss()
                criterion = criterion.to(self.args.device)
                m = torch.nn.Sigmoid()
                m.to(self.args.device)
                loss = criterion(m(logits), label_ids.float())
        else:
            if self.args.num_labels > 2:
                predicted_labels.extend(
                    torch.argmax(logits, dim=1).cpu().detach().numpy())
                target_labels.extend(label_ids.cpu().detach().numpy())
                loss = F.cross_entropy(logits, torch.argmax(label_ids, dim=1))
            else:
                if self.args.is_regression:
                    predicted_labels.extend(
                        logits.view(-1).cpu().detach().numpy())
                    target_labels.extend(
                        label_ids.view(-1).cpu().detach().numpy())
                    criterion = torch.nn.MSELoss()
                    loss = criterion(logits.view(-1), label_ids.view(-1))
                else:
                    criterion = torch.nn.CrossEntropyLoss()
                    loss = criterion(logits.view(-1, self.args.num_labels),
                                     label_ids.view(-1))

        if self.args.n_gpu > 1:
            loss = loss.mean()
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        if self.args.fp16:
            self.optimizer.backward(loss)
        else:
            loss.backward()

        self.tr_loss += loss.item()
        self.nb_tr_steps += 1
        if (step + 1) % self.args.gradient_accumulation_steps == 0:
            if self.args.fp16:
                lr_this_step = self.args.learning_rate * warmup_linear(
                    self.iterations / self.num_train_optimization_steps,
                    self.args.warmup_proportion)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr_this_step
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()
            self.iterations += 1

    if self.args.evaluate_train:
        rmse, kendall, pearson, spearman, pearson_spearman = \
            evaluate_for_regression(target_labels, predicted_labels)
        print('\n' + LOG_HEADER_REG)
        print(LOG_TEMPLATE_REG.format('TRAIN', rmse, kendall, pearson,
                                      spearman, pearson_spearman,
                                      self.tr_loss))
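# The pos_weights flag above feeds BCEWithLogitsLoss's pos_weight argument,
# which rescales the positive term of each label's loss. A common recipe for
# choosing it (a sketch, not taken from this repo) is the negative/positive
# count ratio per label, so rare positive labels are up-weighted:
def pos_weights_from_labels_sketch(label_matrix):
    """label_matrix: (num_examples, num_labels) 0/1 float tensor."""
    positives = label_matrix.sum(dim=0).clamp(min=1)
    negatives = label_matrix.shape[0] - label_matrix.sum(dim=0)
    return negatives / positives

# e.g. pass ','.join(str(w.item()) for w in pos_weights_from_labels_sketch(Y))
# as the pos_weights argument.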