# Imports used by the functions below. Project-local helpers and constants
# (set_model_device, set_tensor_device, encode_document, aggregate, PlotLoss,
# PRFAggregator, PREDICTIONS_FILE, PRETRAINED) are defined elsewhere in this
# repository and are assumed to be importable.
import logging
import os
from collections import OrderedDict

import joblib
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
from torch.optim import AdamW  # the repository may import AdamW from transformers instead
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModel


def predict(self, X, device=None, path=None):

    logging.info('')
    logging.info('=' * 72)
    logging.info("Predict")
    logging.info('=' * 72)

    # Get/set device
    set_model_device(self, device)

    # Configure evaluation mode
    self.eval()

    # Create data set
    dataset = self.dataset_class( \
                        X = X,
                        pretrained = self.pretrained,
                        device = device,
                        doc_definition = self.doc_definition,
                        sent_definition = self.sent_definition,
                        max_length = self.max_length,
                        max_sent_count = self.max_sent_count,
                        linebreak_bound = self.linebreak_bound,
                        keep_ws = self.keep_ws)

    # Create data loader
    dataloader = DataLoader(dataset, \
                        shuffle = False,
                        batch_size = self.batch_size)

    # Deactivate autograd
    with torch.no_grad():

        # len(dataloader) is already the number of batches
        pbar = tqdm(total=len(dataloader))
        y = []
        for i, (input_ids, attention_mask) in enumerate(dataloader):

            verbose = False

            # Push data through model
            doc_scores, sent_scores = self(input_ids, attention_mask, verbose=verbose)

            y_batch = dataset.postprocess_y( \
                            attention_mask = attention_mask,
                            doc_scores = doc_scores,
                            sent_scores = sent_scores)
            y.extend(y_batch)

            pbar.update()
        pbar.close()

    if path is not None:
        f = os.path.join(path, PREDICTIONS_FILE)
        joblib.dump(y, f)

    return y
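# Example: reloading predictions that predict(..., path=output_dir) serialized
# with joblib. `output_dir` is a hypothetical placeholder, not a name defined
# in this repository.
def _load_predictions(output_dir):
    f = os.path.join(output_dir, PREDICTIONS_FILE)
    return joblib.load(f)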
def predict(self, X, device=None, path=None):

    logging.info('')
    logging.info('=' * 72)
    logging.info("Predict")
    logging.info('=' * 72)

    # Do not shuffle
    shuffle = False

    # Get/set device
    set_model_device(self, device)

    # Configure evaluation mode
    self.eval()

    # Set number of cores
    torch.set_num_threads(self.num_workers)

    # Create data set
    dataset = self.dataset_class(X, **self.dataset_params, device=device)

    # Create data loader
    dataloader = DataLoader(dataset, shuffle=shuffle, **self.dataloader_params)

    pbar = tqdm(total=len(dataloader))
    y = []
    for i, (indices, seq_tensor, seq_mask, span_indices, span_mask) in enumerate(dataloader):

        verbose = False

        # Push data through model
        out = self( \
                seq_tensor = seq_tensor,
                seq_mask = seq_mask,
                span_indices = span_indices,
                span_mask = span_mask,
                verbose = verbose)

        y_batch = dataset.postprocess_y( \
                indices = indices,
                span_scores = out["span_scores"],
                span_mask = span_mask,
                role_scores = out["top_role_scores"],
                role_span_mask = out["top_span_mask"],
                role_indices = out["top_indices"])
        y.extend(y_batch)

        pbar.update()
    pbar.close()

    return y
def encode_documents(encoded_dict, pretrained, \
                     word_pieces_keep=None,
                     device=None,
                     train=False,
                     detach=True,
                     move_to_cpu=True,
                     max_length=None):

    model = AutoModel.from_pretrained(pretrained)

    if train:
        model.train()
    else:
        model.eval()

    set_model_device(model, device)

    if word_pieces_keep is None:
        word_pieces_keep = [None for _ in encoded_dict]
    assert len(word_pieces_keep) == len(encoded_dict)

    logging.info("Encoding documents...")

    X = []
    mask = []
    pbar = tqdm(total=len(encoded_dict))
    for encoded, wp_keep in zip(encoded_dict, word_pieces_keep):

        x, m = encode_document( \
                    encoded_dict = encoded,
                    model = model,
                    word_pieces_keep = wp_keep,
                    device = device,
                    detach = detach,
                    move_to_cpu = move_to_cpu,
                    max_length = max_length)
        X.append(x)
        mask.append(m)

        pbar.update()
    pbar.close()

    return (X, mask)
def fit(self, X, y, device=None, path=None, shuffle=True):

    logging.info('')
    logging.info('=' * 72)
    logging.info("Fit")
    logging.info('=' * 72)

    # Get/set device
    set_model_device(self, device)

    # Configure training mode
    self.train()

    # Set number of cores
    torch.set_num_threads(self.num_workers)

    # Create data set
    dataset = self.dataset_class(X, y=y, **self.dataset_params, device=device)

    # Create data loader
    dataloader = DataLoader(dataset, shuffle=shuffle, **self.dataloader_params)

    # Create optimizer
    optimizer = optim.Adam(self.parameters(), **self.optimizer_params)

    # Create loss plotter
    plotter = PlotLoss(path=path)

    # Create prf aggregator
    prf_agg = PRFAggregator()

    # Loop on epochs
    pbar = tqdm(total=self.num_epochs)
    for j in range(self.num_epochs):

        loss_epoch = 0
        losses_epoch = OrderedDict()
        prf = []

        # Loop on mini-batches
        for i, (indices, seq_tensor, seq_mask, span_indices, span_mask, y_true) in enumerate(dataloader):

            verbose = False  # (i == 0) and (j == 0)

            # Reset gradients
            self.zero_grad()

            y_pred = self( \
                        seq_tensor = seq_tensor,
                        seq_mask = seq_mask,
                        span_indices = span_indices,
                        span_mask = span_mask,
                        verbose = verbose)

            loss, loss_dict = self.loss(y_true, y_pred)

            plotter.update_batch(loss, loss_dict)
            prf_agg.update_counts(self.perf_counts(y_true, y_pred))

            # Backprop loss
            loss.backward()

            loss_epoch += loss.item()
            for k, v in loss_dict.items():
                if i == 0:
                    losses_epoch[k] = v.item()
                else:
                    losses_epoch[k] += v.item()

            # Clip gradients
            clip_grad_norm_(self.parameters(), self.hyperparams['grad_max_norm'])

            # Update parameters
            optimizer.step()

        plotter.update_epoch(loss_epoch, losses_epoch)

        msg = []
        msg.append('epoch={}'.format(j))
        msg.append('{}={:.1e}'.format('Total', loss_epoch))
        for k, ls in losses_epoch.items():
            msg.append('{}={:.1e}'.format(k, ls))
        msg.append(prf_agg.prf())
        prf_agg.reset()
        msg = ", ".join(msg)

        pbar.set_description(desc=msg)
        pbar.update()

    print()
    pbar.close()

    return True
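# Example usage (sketch). `SpanModel`, `hyperparams`, `train_docs`, `train_labels`,
# `test_docs`, and `run_dir` are hypothetical placeholders, not identifiers defined
# in this repository:
#
#   model = SpanModel(**hyperparams)
#   model.fit(train_docs, train_labels, device=torch.device("cuda:0"), path=run_dir)
#   y_pred = model.predict(test_docs, device=torch.device("cuda:0"))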
def predict(self, X, device=None, path=None, return_prob=False):

    logging.info('')
    logging.info('=' * 72)
    logging.info("Predict")
    logging.info('=' * 72)

    # Get/set device
    set_model_device(self, device)

    # Configure evaluation mode
    self.eval()

    # Set number of cores
    torch.set_num_threads(self.num_workers)

    # Create data set
    dataset = self.dataset_class(X, **self.dataset_params, device=device)

    # Create data loader
    dataloader = DataLoader(dataset, shuffle=False, **self.dataloader_params)

    pbar = tqdm(total=len(dataloader))
    y = []
    for i, (doc_indices, seq_tensor, seq_mask) in enumerate(dataloader):

        verbose = False

        # Push data through model
        out = self(seq_tensor, seq_mask, verbose=verbose)

        if return_prob:
            y_batch = dataset.postprocess_y_prob( \
                            doc_indices = doc_indices,
                            seq_mask = seq_mask,
                            doc_scores = out["doc_scores"])
        else:
            y_batch = dataset.postprocess_y( \
                            doc_indices = doc_indices,
                            seq_mask = seq_mask,
                            doc_scores = out["doc_scores"],
                            sent_scores = out["sent_scores"])
        y.extend(y_batch)

        pbar.update()
    pbar.close()

    if path is not None:
        f = os.path.join(path, PREDICTIONS_FILE)
        joblib.dump(y, f)

    return y
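# Minimal illustration of the logits-to-probabilities step that return_prob=True
# implies, assuming each entry of out["doc_scores"] holds unnormalized class
# logits of shape (batch_size, num_classes). This is a sketch of the general
# pattern, not the repository's postprocess_y_prob implementation.
def _demo_doc_probabilities():
    doc_scores = torch.tensor([[2.0, -1.0, 0.5],
                               [0.1,  0.3, 0.2]])   # (batch=2, classes=3)
    doc_probs = F.softmax(doc_scores, dim=-1)       # each row sums to 1
    doc_labels = doc_scores.argmax(dim=-1)          # hard label predictions
    return doc_probs, doc_labels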
def fit(self, X, y, device=None, path=None, shuffle=True):
    '''
    Parameters
    ----------
    X: documents as list of strings [doc [str]]
    y: labels as list of dictionaries
    '''

    logging.info('')
    logging.info('=' * 72)
    logging.info("Fit")
    logging.info('=' * 72)

    # Get/set device
    set_model_device(self, device)

    # Configure training mode
    self.train()

    # Create data set
    dataset = self.dataset_class( \
                        X = X,
                        y = y,
                        pretrained = self.pretrained,
                        device = device,
                        doc_definition = self.doc_definition,
                        sent_definition = self.sent_definition,
                        max_length = self.max_length,
                        max_sent_count = self.max_sent_count,
                        linebreak_bound = self.linebreak_bound,
                        keep_ws = self.keep_ws)

    # Create data loader
    dataloader = DataLoader(dataset, \
                        shuffle = shuffle,
                        batch_size = self.batch_size)

    # Create optimizer
    # Pretrained BERT parameters keep the base learning rate; newly added
    # parameters train at lr * lr_ratio.
    # See https://github.com/huggingface/transformers/issues/657
    if self.lr_ratio == 1:
        optimizer = AdamW(self.parameters(), lr=self.lr)
    else:
        pretrained = self.bert.parameters()
        # Names of pretrained parameters (including `bert.` prefix)
        pretrained_names = [f'bert.{k}' for (k, v) in self.bert.named_parameters()]
        new_params = [v for k, v in self.named_parameters() if k not in pretrained_names]
        optimizer = AdamW( \
                    [{'params': pretrained},
                     {'params': new_params, 'lr': self.lr * self.lr_ratio}],
                    lr=self.lr)

    # Create loss plotter
    plotter = PlotLoss(path=path)

    # Create prf aggregator
    prf_agg = PRFAggregator()

    # Loop on epochs
    pbar = tqdm(total=self.num_epochs)
    for j in range(self.num_epochs):

        loss_epoch = 0
        losses_epoch = OrderedDict()
        prf = []

        # Loop on mini-batches
        for i, (input_ids, attention_mask, doc_labels, sent_labels) in enumerate(dataloader):

            verbose = False  # (i == 0) and (j == 0)

            # Reset gradients
            self.zero_grad()

            doc_scores, sent_scores = self(input_ids, attention_mask, verbose=verbose)

            # Document-level loss for each label type
            loss_dict = OrderedDict()
            for k in doc_labels:
                loss_dict[f"doc_{k[0:3]}"] = F.cross_entropy( \
                                    input = doc_scores[k],
                                    target = doc_labels[k],
                                    reduction = self.loss_reduction)

            # Optional sentence-level loss for each label type
            if self.use_sent_objective:
                for k in doc_labels:
                    ls = []
                    for t in sent_labels[k]:
                        scores = sent_scores[k][t]
                        labels = sent_labels[k][t]

                        doc_count, sent_count, _ = tuple(scores.shape)
                        scores = scores.view(doc_count * sent_count, -1)
                        labels = labels.view(doc_count * sent_count)

                        l = F.cross_entropy( \
                                    input = scores,
                                    target = labels,
                                    reduction = self.loss_reduction)
                        ls.append(l)
                    ls = aggregate(torch.stack(ls), self.loss_reduction)
                    loss_dict[f"sent_{k[0:3]}"] = ls

            # Aggregate all losses
            loss = [v for k, v in loss_dict.items() if v is not None]
            loss = aggregate(torch.stack(loss), self.loss_reduction)

            plotter.update_batch(loss, loss_dict)
            #prf_agg.update_counts(self.perf_counts(y_true, y_pred))

            # Backprop loss
            loss.backward()

            loss_epoch += loss.item()
            for k, v in loss_dict.items():
                if i == 0:
                    losses_epoch[k] = v.item()
                else:
                    losses_epoch[k] += v.item()

            # Clip gradients
            clip_grad_norm_(self.parameters(), self.grad_max_norm)

            # Update parameters
            optimizer.step()

        plotter.update_epoch(loss_epoch, losses_epoch)

        msg = []
        msg.append('epoch={}'.format(j))
        msg.append('{}={:.1e}'.format('Total', loss_epoch))
        for k, ls in losses_epoch.items():
            msg.append('{}={:.1e}'.format(k, ls))
        #msg.append(prf_agg.prf())
        #prf_agg.reset()
        msg = ", ".join(msg)

        pbar.set_description(desc=msg)
        pbar.update()

    pbar.close()

    return True
def encode_documents(input_ids, mask, \
                     pretrained=PRETRAINED, device=None, train=False):

    logging.info("Embedding using AutoModel")

    model = AutoModel.from_pretrained(pretrained)

    if train:
        model.train()
    else:
        model.eval()

    set_model_device(model, device)

    X = []
    pbar = tqdm(total=len(input_ids))
    assert len(input_ids) == len(mask)
    for i, (ids, msk) in enumerate(zip(input_ids, mask)):

        ids = set_tensor_device(ids, device)
        msk = set_tensor_device(msk, device)

        # Last hidden state of the encoder
        x = model( \
                ids,
                token_type_ids = None,
                attention_mask = msk)[0]
        x = x.cpu().detach()
        X.append(x)

        if i == 1:
            logging.info("Encode documents")
            logging.info('IDs:  {}'.format(ids.shape))
            logging.info('Mask: {}'.format(msk.shape))
            logging.info('X:    {}'.format(x.shape))
            logging.info('')

        pbar.update()
    pbar.close()

    logging.info("")
    logging.info('Document count: {}'.format(len(X)))
    logging.info("")

    return X
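# Example usage (sketch): tokenize a few documents with the matching Hugging Face
# tokenizer and embed them with encode_documents above. The checkpoint name, the
# example documents, and max_length are assumptions for illustration only.
def _demo_encode_documents():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    docs = ["Patient reports chest pain.", "No acute distress noted."]
    encoded = tokenizer(docs, padding=True, truncation=True, max_length=128,
                        return_tensors="pt")

    # One (ids, mask) pair per document, matching the per-document loop above
    input_ids = [ids.unsqueeze(0) for ids in encoded["input_ids"]]
    mask = [m.unsqueeze(0) for m in encoded["attention_mask"]]

    return encode_documents(input_ids, mask, pretrained="bert-base-uncased")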