def fit(
    self,
    df_train_features: pd.DataFrame,
    df_train_tokens_reader: pd.io.parsers.TextFileReader,
    df_train_label: pd.DataFrame,
    df_val_features: pd.DataFrame,
    df_val_tokens_reader: pd.io.parsers.TextFileReader,
    df_val_label: pd.DataFrame,
    save_filename: str,
    cat_feature_set: set,
    normalize: bool = True,
    train_batches_to_skip: int = 0,
    val_batches_to_skip: int = 0,
    pretrained_model_dict_path: str = None,
    pretrained_optimizer_dict_path: str = None,
):
    self.df_train_label = df_train_label
    self.df_val_label = df_val_label

    print(df_train_features)
    print(df_val_features)

    assert len(df_train_features.columns) == len(
        df_val_features.columns
    ), "df_train_features and df_val_features have a different number of columns"

    if normalize:
        df_train_features = self._normalize_features(df_train_features, is_train=True)
        df_val_features = self._normalize_features(df_val_features)

    print(df_train_features)
    print(df_val_features)

    gpu = torch.cuda.is_available()
    if gpu:
        torch.cuda.manual_seed_all(self.seed_val)

    # The feed-forward head takes the BERT embedding concatenated with the
    # tabular features.
    ffnn_input_size = HIDDEN_SIZE_BERT + df_train_features.shape[1]
    self.model = self._get_model(ffnn_input_size=ffnn_input_size)

    if pretrained_model_dict_path is not None:
        print(f"Loading pretrained model : {pretrained_model_dict_path}")
        self.model.load_state_dict(torch.load(pretrained_model_dict_path))

    if gpu:
        self.model.cuda()

    # Freeze all BERT layers:
    # for param in self.model.bert.parameters():
    #     param.requires_grad = False

    # Combine the inputs into datasets that read the tokens chunk by chunk.
    train_dataset = CustomDatasetCap(
        class_label=self.class_label,
        df_features=df_train_features,
        df_tokens_reader=df_train_tokens_reader,
        df_label=df_train_label,
        cap=self.cap_length,
        batches_to_skip=train_batches_to_skip)
    val_dataset = CustomDatasetCap(
        class_label=self.class_label,
        df_features=df_val_features,
        df_tokens_reader=df_val_tokens_reader,
        df_label=df_val_label,
        cap=self.cap_length,
        batches_to_skip=val_batches_to_skip)

    train_dataloader, validation_dataloader = create_data_loaders(
        train_dataset, val_dataset, batch_size=df_train_tokens_reader.chunksize)

    # Prepare the optimizer and schedule (linear warmup and decay).
    # Weight decay is applied to every parameter except biases and
    # LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in self.model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': self.weight_decay
    }, {
        'params': [
            p for n, p in self.model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]

    # Note: AdamW is a class from the huggingface library (as opposed to
    # pytorch); the 'W' stands for the 'weight decay fix'.
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=self.lr,    # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=self.eps)  # args.adam_epsilon - default is 1e-8

    if pretrained_optimizer_dict_path is not None:
        print(f"Loading pretrained optimizer : {pretrained_optimizer_dict_path}")
        optimizer.load_state_dict(torch.load(pretrained_optimizer_dict_path))

    # The total number of training steps is [number of batches] x [number of
    # epochs]. (Note that this is not the same as the number of training
    # samples.)
    total_steps = len(train_dataloader) * self.epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)

    # We'll store a number of quantities such as training and validation
    # loss, validation accuracy, and timings.
    training_stats = []

    # Measure the total training time for the whole run.
    total_t0 = time.time()
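    # Scheduler note: with num_warmup_steps=0 there is no warmup phase;
    # get_linear_schedule_with_warmup scales the base rate by
    # (total_steps - current_step) / total_steps, so the learning rate decays
    # linearly from self.lr at step 0 down to 0 at the final step.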
    # For each epoch...
    for epoch_i in range(0, self.epochs):
        # ========================================
        #               Training
        # ========================================
        # Perform one full pass over the training set.
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, self.epochs))
        print('Training...')

        avg_train_loss, training_time, prauc_train, rce_train = self.train(
            self.model, train_dataloader, optimizer, scheduler)

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our
        # performance on the validation set.
        print("")
        print("Running Validation...")

        avg_val_accuracy, avg_val_loss, validation_time, prauc_val, rce_val = \
            self.validation(model=self.model,
                            validation_dataloader=validation_dataloader)

        # Record all statistics from this epoch.
        curr_stats = {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'PRAUC train': prauc_train,
            'RCE train': rce_train,
            'PRAUC val': prauc_val,
            'RCE val': rce_val,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
        training_stats.append(curr_stats)

        # Checkpoint the model and optimizer after every epoch.
        pathlib.Path('./saved_models').mkdir(parents=True, exist_ok=True)
        model_path = f"./saved_models/saved_model_{save_filename}"
        optimizer_path = f"./saved_models/saved_optimizer_{save_filename}"
        print(f"Saving model : {model_path}")
        torch.save(self.model.state_dict(), model_path)
        torch.save(optimizer.state_dict(), optimizer_path)

        # Build a human-readable summary of the epoch (optionally sent to a
        # Telegram bot).
        bot_string = f"DistilBertDoubleInput NN - {self.class_label} \n ---------------- \n"
        bot_string += str(self.model)
        bot_string += "Weight decay: " + str(self.weight_decay) + "\n"
        bot_string += "Learning rate: " + str(self.lr) + "\n"
        bot_string += "Epsilon: " + str(self.eps) + "\n ---------------- \n"
        bot_string += "\n".join(
            key + ": " + str(curr_stats[key]) for key in curr_stats) + "\n\n"
        bot_string += "Saved to : " + model_path
        # telegram_bot_send_update(bot_string)

    print("")
    print("Training complete!")
    print("Total training took {:} (h:mm:ss)".format(
        format_time(time.time() - total_t0)))

    return training_stats
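# Driver sketch for `fit` (the trainer name and file paths below are
# hypothetical -- adapt to the real constructor). The token files are read
# with pandas' `chunksize`, so they stream in batches, and `fit` reuses the
# reader's chunksize as the DataLoader batch size:
#
#     trainer = DistilBertDoubleInputTrainer(class_label="like")  # hypothetical
#     train_tokens_reader = pd.read_csv("train_tokens.csv", chunksize=64)
#     val_tokens_reader = pd.read_csv("val_tokens.csv", chunksize=64)
#     stats = trainer.fit(
#         df_train_features, train_tokens_reader, df_train_label,
#         df_val_features, val_tokens_reader, df_val_label,
#         save_filename="like_run1.pt",
#         cat_feature_set=set())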
def train(self, model, train_dataloader, optimizer, scheduler):
    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode. Don't be misled -- the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    preds = None
    labels = None

    # For each batch of training data...
    for step, batch in tqdm(enumerate(train_dataloader),
                            total=len(train_dataloader)):
        # Unpack this training batch from our dataloader. As we unpack the
        # batch, we also copy each tensor to the GPU using the `to` method.
        #
        # `batch` contains four pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: features
        #   [3]: labels
        b_input_ids = batch[0].to(self.device)
        b_input_mask = batch[1].to(self.device)
        b_features = batch[2].to(self.device)
        b_labels = batch[3].to(self.device)

        # Always clear any previously calculated gradients before performing
        # a backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # It returns different numbers of values depending on which arguments
        # are given and which flags are set. For our usage here, it returns
        # the loss (because we provided labels) and the "logits" -- the model
        # outputs prior to activation.
        loss, logits, curr_preds = model(
            input_ids=b_input_ids,
            input_features=b_features,
            # token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing
        # a single value; the `.item()` function just returns the Python value
        # from the tensor.
        total_train_loss += loss.item()

        # Collect predictions and labels on the CPU for the end-of-epoch
        # metrics; `detach` drops the autograd graph so it can be freed.
        curr_preds = curr_preds.detach().cpu().numpy()
        preds = curr_preds if preds is None else np.vstack([preds, curr_preds])

        curr_labels = b_labels.detach().cpu().numpy()
        labels = curr_labels if labels is None else np.hstack([labels, curr_labels])

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule" -- how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()
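    # Note: growing `preds`/`labels` with np.vstack/np.hstack re-copies the
    # full arrays on every batch, which is quadratic over the epoch; if this
    # ever becomes a bottleneck, the usual pattern is to append the per-batch
    # arrays to a list and call np.concatenate once after the loop.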
    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Compute epoch-level metrics on the accumulated predictions.
    prauc, rce, conf, max_pred, min_pred, avg = self.evaluate(
        preds=preds, labels=labels)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("STATS FOR CURRENT EPOCH"
          f"\nPRAUC : {prauc}"
          f"\nRCE : {rce}"
          f"\nMIN : {min_pred}"
          f"\nMAX : {max_pred}"
          f"\nAVG : {avg}")
    print("  Training epoch took: {:}".format(training_time))

    return avg_train_loss, training_time, prauc, rce
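# `self.evaluate` is expected to report the RecSys Challenge 2020 metrics.
# A minimal sketch of those definitions, as an assumption -- the project's
# actual `evaluate` may compute them differently:
#
#     from sklearn.metrics import precision_recall_curve, auc, log_loss
#
#     def compute_prauc(pred, gt):
#         prec, recall, _ = precision_recall_curve(gt, pred)
#         return auc(recall, prec)
#
#     def compute_rce(pred, gt):
#         cross_entropy = log_loss(gt, pred)
#         ctr = np.mean(gt)                       # positive-class prior
#         strawman_ce = log_loss(gt, [ctr] * len(gt))
#         return (1.0 - cross_entropy / strawman_ce) * 100.0
#
# PRAUC is the area under the precision-recall curve; RCE > 0 means the model
# beats a constant predictor that always outputs the training prior.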
def validation(self, model, validation_dataloader):
    # Measure how long the validation run takes.
    t0 = time.time()

    # Put the model in evaluation mode -- the dropout layers behave
    # differently during evaluation.
    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    preds = None
    labels = None

    # Evaluate data for one epoch.
    for step, batch in tqdm(enumerate(validation_dataloader),
                            total=len(validation_dataloader)):
        # Unpack this validation batch from our dataloader. As we unpack the
        # batch, we also copy each tensor to the GPU using the `to` method.
        #
        # `batch` contains four pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: features
        #   [3]: labels
        b_input_ids = batch[0].to(self.device)
        b_input_mask = batch[1].to(self.device)
        b_features = batch[2].to(self.device)
        b_labels = batch[3].to(self.device)

        # Tell pytorch not to bother with constructing the compute graph
        # during the forward pass, since this is only needed for backprop
        # (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions. The "logits" are the
            # output values prior to applying an activation function like the
            # softmax. The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            loss, logits, curr_preds = model(
                input_ids=b_input_ids,
                input_features=b_features,
                # token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels)

        curr_preds = curr_preds.detach().cpu().numpy()
        preds = curr_preds if preds is None else np.vstack([preds, curr_preds])

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        labels = label_ids if labels is None else np.hstack([labels, label_ids])

        # Calculate the accuracy for this batch, and accumulate it over all
        # batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    prauc, rce, conf, max_pred, min_pred, avg = self.evaluate(
        preds=preds, labels=labels)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("STATS FOR VALIDATION"
          f"\nPRAUC : {prauc}"
          f"\nRCE : {rce}"
          f"\nMIN : {min_pred}"
          f"\nMAX : {max_pred}"
          f"\nAVG : {avg}")
    print("  Validation took: {:}".format(validation_time))

    return avg_val_accuracy, avg_val_loss, validation_time, prauc, rce
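# `flat_accuracy` above is assumed to be the standard BERT fine-tuning
# helper; a minimal sketch (argmax over logits against flattened labels):
#
#     def flat_accuracy(preds, labels):
#         pred_flat = np.argmax(preds, axis=1).flatten()
#         labels_flat = labels.flatten()
#         return np.sum(pred_flat == labels_flat) / len(labels_flat)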
def validation_multi(self, model, validation_dataloader):
    # Validation for the multi-output model: the forward pass returns one
    # (loss, logits, preds) tuple per engagement head (LIKE, RETWEET, REPLY,
    # COMMENT). Named `validation_multi` so that it does not shadow the
    # single-task `validation` above, whose five return values `fit` expects.
    t0 = time.time()

    # Put the model in evaluation mode -- the dropout layers behave
    # differently during evaluation.
    model.eval()

    # Tracking variables: one entry per engagement head.
    total_eval_loss = 0
    preds_list = [None] * 4
    labels_list = [None] * 4

    # Evaluate data for one epoch.
    for step, batch in tqdm(enumerate(validation_dataloader),
                            total=len(validation_dataloader)):
        # Unpack this validation batch from our dataloader. As we unpack the
        # batch, we also copy each tensor to the GPU using the `to` method.
        #
        # `batch` contains four pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: features
        #   [3]: labels
        b_input_ids = batch[0].to(self.device)
        b_input_mask = batch[1].to(self.device)
        b_features = batch[2].to(self.device)
        b_labels = batch[3].to(self.device)

        # Tell pytorch not to bother with constructing the compute graph
        # during the forward pass, since this is only needed for backprop
        # (training).
        with torch.no_grad():
            # Forward pass: one (loss, logits, preds) tuple per head.
            output_list = model(
                input_ids=b_input_ids,
                input_features=b_features,
                attention_mask=b_input_mask,
                labels=b_labels,
            )

        loss = output_list[0][0]
        total_eval_loss += loss.item()

        for i in range(4):
            curr_preds = output_list[i][2]
            if torch.is_tensor(curr_preds):
                # Move to CPU/numpy if the model returned tensors.
                curr_preds = curr_preds.detach().cpu().numpy()
            if preds_list[i] is None:
                preds_list[i] = curr_preds
            else:
                preds_list[i] = np.hstack([preds_list[i], curr_preds])

            curr_labels = b_labels.detach().cpu().numpy()[:, i]
            if labels_list[i] is None:
                labels_list[i] = curr_labels
            else:
                labels_list[i] = np.hstack([labels_list[i], curr_labels])

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    print("VALIDATION STATISTICS FOR EPOCH")
    head_names = ["LIKE", "RETWEET", "REPLY", "COMMENT"]
    for i in range(4):
        prauc, rce, conf, max_pred, min_pred, avg = self.evaluate(
            preds=preds_list[i], labels=labels_list[i])
        print(f"\n------- {head_names[i]} -------")
        print(f"PRAUC : {prauc}"
              f"\nRCE : {rce}"
              f"\nMIN : {min_pred}"
              f"\nMAX : {max_pred}"
              f"\nAVG : {avg}")

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    return avg_val_loss, validation_time
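# Usage sketch for the multi-head variant (names are placeholders):
#
#     avg_val_loss, validation_time = trainer.validation_multi(
#         model=trainer.model,
#         validation_dataloader=validation_dataloader)
#
# Unlike the single-task `validation`, this prints PRAUC/RCE per engagement
# head and returns only the average loss and wall-clock time.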