def loss(self, span_labels, seq_scores, seq_mask, span_map):

    loss = []
    for k, scorer in self.scorers.items():
        ls = scorer.loss(span_labels[k], seq_scores[k], seq_mask, span_map)
        loss.append(ls)
    loss = aggregate(torch.stack(loss), self.loss_reduction)

    return loss
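# NOTE (assumption): `aggregate` is a repo helper that is not shown in this
# section. The loss methods here only rely on it reducing the stacked 1-D
# tensor of per-head losses according to `self.loss_reduction`. A minimal
# sketch consistent with that usage -- the name `_aggregate_sketch` and the
# supported reduction strings are illustrative, not the actual implementation:
def _aggregate_sketch(losses, reduction):
    # losses: 1-D tensor produced by torch.stack([...])
    if reduction == 'sum':
        return losses.sum()
    if reduction == 'mean':
        return losses.mean()
    raise ValueError(f"Unsupported reduction: {reduction}")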
def loss(self, labels, scores):

    loss = []
    for k, classifier in self.classifiers.items():
        k = self.from_key(k)
        ls = classifier.loss(labels[k], scores[k])
        loss.append(ls)
    loss = aggregate(torch.stack(loss), self.loss_reduction)

    return loss
def loss(self, labels, top_scores, top_mask, top_indices):

    loss = []
    for k, scorer in self.scorers.items():
        ab = self.from_key(k)
        ls = scorer.loss(labels[ab], top_scores[ab], top_mask, top_indices)
        loss.append(ls)
    loss = aggregate(torch.stack(loss), self.loss_reduction)

    return loss
def loss(self, labels, scores, mask):

    loss = []
    for k, scorer in self.scorers.items():
        ls = scorer.loss(labels[k], scores[k], mask)
        loss.append(ls)

        # Diagnostic counts of gold vs. predicted positive labels
        # (currently unused)
        _, pred = scores[k].max(-1)
        true = (labels[k] > 0).sum().tolist()
        pos = (pred > 0).sum().tolist()

    loss = aggregate(torch.stack(loss), self.loss_reduction)

    return loss
def loss(self, doc_labels, doc_scores, sent_labels=None, sent_scores=None, as_dict=False):

    doc_loss = OrderedDict()
    sent_loss = OrderedDict()
    for k, classifier in self.classifiers.items():
        dl, sl = classifier.loss( \
            doc_labels = doc_labels[k],
            doc_scores = doc_scores[k],
            sent_labels = None if sent_labels is None else sent_labels[k],
            sent_scores = None if sent_scores is None else sent_scores[k])
        doc_loss[k] = dl
        sent_loss[k] = sl

    if as_dict:
        return (doc_loss, sent_loss)
    else:
        doc_loss = [v for k, v in doc_loss.items()]
        doc_loss = aggregate(torch.stack(doc_loss), self.loss_reduction)

        if self.use_sent_objective:
            sent_loss = [v for k, v in sent_loss.items()]
            sent_loss = aggregate(torch.stack(sent_loss), self.loss_reduction)
        else:
            sent_loss = None

        return (doc_loss, sent_loss)
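# Example calls for the document/sentence loss above (hedged sketch: the
# `doc_classifier` instance name and the structure of the label/score dicts
# are assumptions inferred from how the method indexes them, not taken from
# the repo):
#
#   # Per-task OrderedDicts keyed like self.classifiers, e.g. for logging
#   doc_loss, sent_loss = doc_classifier.loss(doc_labels, doc_scores,
#                                             sent_labels=sent_labels,
#                                             sent_scores=sent_scores,
#                                             as_dict=True)
#
#   # Aggregated scalars; sent_loss is None unless use_sent_objective is set
#   doc_loss, sent_loss = doc_classifier.loss(doc_labels, doc_scores,
#                                             sent_labels=sent_labels,
#                                             sent_scores=sent_scores)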
def loss(self, y_true, y_pred, span_map=None):

    span_loss, role_loss = self.relation_extractor.loss( \
        span_labels = y_true['span_labels'],
        span_scores = y_pred['span_scores'],
        span_mask = y_true['span_mask'],
        role_labels = y_true['role_labels'],
        top_role_scores = y_pred['top_role_scores'],
        top_span_mask = y_pred['top_span_mask'],
        top_indices = y_pred['top_indices'])

    loss_dict = OrderedDict()
    loss_dict["span_loss"] = span_loss
    loss_dict["role_loss"] = role_loss

    loss = torch.stack([v for k, v in loss_dict.items()])
    loss = aggregate(loss, self.loss_reduction)

    return (loss, loss_dict)
def loss(self, y_true, y_pred, span_map=None):

    loss_dict = OrderedDict()

    if self.use_doc_classifier:
        doc_loss, sent_loss = self.doc_classifier.loss( \
            doc_labels = y_true["doc_labels"],
            doc_scores = y_pred["doc_scores"],
            sent_labels = y_true["sent_labels"],
            sent_scores = y_pred["sent_scores"],
            as_dict = True)

        for k, v in doc_loss.items():
            loss_dict[f"doc_{k}"] = v
        for k, v in sent_loss.items():
            loss_dict[f"sent_{k}"] = v

    if self.use_span_classifier:
        span_labels = nest_dict(y_true["span_labels"])
        span_scores = y_pred["span_scores"]
        span_mask = y_true["span_mask"]
        seq_mask = y_true["seq_mask"]
        seq_scores = y_pred["seq_scores"]
        role_labels = nest_dict(y_true["role_labels"])
        top_role_scores = y_pred["top_role_scores"]
        top_span_mask = y_pred["top_span_mask"]
        top_indices = y_pred["top_indices"]

        span_loss = []
        role_loss = []
        for i in range(len(span_labels)):
            span_ls, role_ls = self.relation_extractor.loss( \
                span_labels = span_labels[i],
                span_scores = span_scores[i],
                span_mask = span_mask[i],
                role_labels = role_labels[i],
                top_role_scores = top_role_scores[i],
                top_span_mask = top_span_mask[i],
                top_indices = top_indices[i],
                seq_scores = seq_scores[i],
                seq_mask = seq_mask[i],
                span_map = span_map)
            span_loss.append(span_ls)
            role_loss.append(role_ls)

        span_loss = aggregate(torch.stack(span_loss), self.loss_reduction)
        role_loss = aggregate(torch.stack(role_loss), self.loss_reduction)

        loss_dict["span"] = span_loss
        loss_dict["role"] = role_loss

    loss_dict = OrderedDict([(k, v) for k, v in loss_dict.items() if v is not None])

    loss = torch.stack([v for k, v in loss_dict.items()])
    loss = aggregate(loss, self.loss_reduction)

    return (loss, loss_dict)
def fit(self, X, y, device=None, path=None, shuffle=True):
    '''
    Parameters
    ----------
    X: documents as list of strings [doc [str]]
    y: labels as list of dictionaries
    '''

    logging.info('')
    logging.info('=' * 72)
    logging.info("Fit")
    logging.info('=' * 72)

    # Get/set device
    set_model_device(self, device)

    # Configure training mode
    self.train()

    # Create data set
    dataset = self.dataset_class( \
        X = X,
        y = y,
        pretrained = self.pretrained,
        device = device,
        doc_definition = self.doc_definition,
        sent_definition = self.sent_definition,
        max_length = self.max_length,
        max_sent_count = self.max_sent_count,
        linebreak_bound = self.linebreak_bound,
        keep_ws = self.keep_ws)

    # Create data loader
    dataloader = DataLoader(dataset, \
        shuffle = shuffle,
        batch_size = self.batch_size)

    # Create optimizer
    '''
    https://github.com/huggingface/transformers/issues/657

    pretrained = model.bert.parameters()

    # Get names of pretrained parameters (including `bert.` prefix)
    pretrained_names = [f'bert.{k}' for (k, v) in model.bert.named_parameters()]

    new_params = [v for k, v in model.named_parameters() if k not in pretrained_names]

    optimizer = AdamW(
        [{'params': pretrained}, {'params': new_params, 'lr': learning_rate * 10}],
        lr=learning_rate,
    )
    '''
    if self.lr_ratio == 1:
        optimizer = AdamW(self.parameters(), lr=self.lr)
    else:
        pretrained = self.bert.parameters()
        pretrained_names = [f'bert.{k}' for (k, v) in self.bert.named_parameters()]
        new_params = [v for k, v in self.named_parameters() if k not in pretrained_names]
        optimizer = AdamW(
            [{'params': pretrained},
             {'params': new_params, 'lr': self.lr * self.lr_ratio}],
            lr=self.lr)

    # Define cross entropy
    #cross_entropy = nn.NLLLoss(reduction=self.loss_reduction)

    # Create loss plotter
    plotter = PlotLoss(path=path)

    # Create prf aggregator
    prf_agg = PRFAggregator()

    # Loop on epochs
    pbar = tqdm(total=self.num_epochs)
    for j in range(self.num_epochs):

        loss_epoch = 0
        losses_epoch = OrderedDict()
        prf = []

        # Loop on mini-batches
        for i, (input_ids, attention_mask, doc_labels, sent_labels) in enumerate(dataloader):

            verbose = False  #(i == 0) and (j == 0)

            # Reset gradients
            self.zero_grad()

            doc_scores, sent_scores = self(input_ids, attention_mask, verbose=verbose)

            # Document-level loss for each labeling task
            loss_dict = OrderedDict()
            for k in doc_labels:
                loss_dict[f"doc_{k[0:3]}"] = F.cross_entropy( \
                    input = doc_scores[k],
                    target = doc_labels[k],
                    reduction = self.loss_reduction)

            # Optional sentence-level auxiliary loss
            if self.use_sent_objective:
                for k in doc_labels:
                    ls = []
                    for t in sent_labels[k]:
                        scores = sent_scores[k][t]
                        labels = sent_labels[k][t]

                        # Flatten (doc, sent) dimensions for cross entropy
                        doc_count, sent_count, _ = tuple(scores.shape)
                        scores = scores.view(doc_count * sent_count, -1)
                        labels = labels.view(doc_count * sent_count)

                        l = F.cross_entropy( \
                            input = scores,
                            target = labels,
                            reduction = self.loss_reduction)
                        ls.append(l)
                    ls = aggregate(torch.stack(ls), self.loss_reduction)
                    loss_dict[f"sent_{k[0:3]}"] = ls

            loss = [v for k, v in loss_dict.items() if v is not None]
            loss = aggregate(torch.stack(loss), self.loss_reduction)

            plotter.update_batch(loss, loss_dict)
            #prf_agg.update_counts(self.perf_counts(y_true, y_pred))

            # Backprop loss
            loss.backward()

            loss_epoch += loss.item()
            for k, v in loss_dict.items():
                if i == 0:
                    losses_epoch[k] = v.item()
                else:
                    losses_epoch[k] += v.item()

            # Clip gradients
            clip_grad_norm_(self.parameters(), self.grad_max_norm)

            # Update parameters
            optimizer.step()

        plotter.update_epoch(loss_epoch, losses_epoch)

        msg = []
        msg.append('epoch={}'.format(j))
        msg.append('{}={:.1e}'.format('Total', loss_epoch))
        for k, ls in losses_epoch.items():
            msg.append('{}={:.1e}'.format(k, ls))
        #msg.append(prf_agg.prf())
        #prf_agg.reset()

        msg = ", ".join(msg)
        pbar.set_description(desc=msg)
        pbar.update()

    pbar.close()

    return True