def main():
    # Parse command-line arguments (auto-generated from the docstring at the top of this file)
    args = docopt(__doc__)
    pprint(args)

    # Read the parameters
    weights = args['<pretrained_weights>']
    dir_name = args['--dir_name']
    lr = float(args['--lr'])
    seq_len = int(args['--seq_len'])
    max_epoch = int(args['--max_epoch'])
    batch_size = int(args['--batch_size'])
    num_train = int(args['--num_train'])
    num_valid = int(args['--num_valid'])

    # Select the model architecture that matches the chosen pretrained weights
    if weights == 'bert-base-uncased':
        tokenizer = BertTokenizer.from_pretrained(weights)
        config = BertConfig(num_labels=4)
        model = BertForSequenceClassification.from_pretrained(weights, config=config)
    elif weights == 'bert-large-uncased':
        tokenizer = BertTokenizer.from_pretrained(weights)
        config = BertConfig(hidden_size=1024, num_hidden_layers=24, num_attention_heads=16,
                            intermediate_size=4096, num_labels=4)
        model = BertForSequenceClassification.from_pretrained(weights, config=config)
    elif weights in ['albert-base-v1', 'albert-large-v1', 'albert-base-v2', 'albert-large-v2']:
        tokenizer = AlbertTokenizer.from_pretrained(weights)
        config = AlbertConfig.from_pretrained(weights, num_labels=4)
        model = AlbertForSequenceClassification.from_pretrained(weights, config=config)

    # Pick the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Load the data and build the datasets
    encoder = TwinPhraseEncoder(tokenizer, seq_len)
    train_dataset = WordnetDataset(mode='train', num_data=num_train, transform=encoder)
    valid_dataset = WordnetDataset(mode='valid', num_data=num_valid, transform=encoder)
    train_loader = data.DataLoader(train_dataset, batch_size, shuffle=True)
    valid_loader = data.DataLoader(valid_dataset, batch_size, shuffle=True)

    # Define the optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Create the output directories
    log_dir = f'../logs/{dir_name}'
    model_dir = f'../models/{dir_name}'
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    # Training logs (TensorBoard)
    train_writer = SummaryWriter(log_dir=f'{log_dir}/train')
    valid_writer = SummaryWriter(log_dir=f'{log_dir}/valid')

    # Training loop
    for epoch in range(1, max_epoch + 1):
        print('=' * 27 + f' Epoch {epoch:0>2} ' + '=' * 27)

        # Training
        loss, accu = train_model(model, optimizer, train_loader, device)
        print(f'| Training   | loss-avg : {loss:>8.6f} | accuracy : {accu:>8.3%} |')
        train_writer.add_scalar('loss', loss, epoch)
        train_writer.add_scalar('accu', accu, epoch)

        # Validation
        loss, accu = valid_model(model, optimizer, valid_loader, device)
        print(f'| Validation | loss-avg : {loss:>8.6f} | accuracy : {accu:>8.3%} |')
        valid_writer.add_scalar('loss', loss, epoch)
        valid_writer.add_scalar('accu', accu, epoch)

        # Save the model
        torch.save(model.state_dict(), f'{model_dir}/epoch-{epoch:0>2}.pkl')

    train_writer.close()
    valid_writer.close()
                              batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler,
                                   batch_size=batch_size)

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "dkleczek/bert-base-polish-cased-v1",  # 12-layer Polish cased BERT model.
    num_labels=2,                # The number of output labels--2 for binary classification.
                                 # You can increase this for multi-class tasks.
    output_attentions=False,     # Whether the model returns attention weights.
    output_hidden_states=False,  # Whether the model returns all hidden states.
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1)

# Tell pytorch to run this model on the GPU.
model.cuda()

# Note: AdamW is a class from the huggingface library (as opposed to pytorch);
# the 'W' stands for 'Weight Decay fix'.
optimizer = AdamW(
    model.parameters(),
    lr=5e-5,    # args.learning_rate - default is 5e-5, our notebook had 2e-5
    eps=1e-8,   # args.adam_epsilon - default is 1e-8.
    weight_decay=0.05)
from transformers import BertForSequenceClassification, AdamW
from utils.config import bert_config_from_file

config = bert_config_from_file('conf/bert.json')
model = BertForSequenceClassification.from_pretrained(
    config['pretrained'], cache_dir=config['cache_dir'], num_labels=5)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [p for n, p in model.named_parameters()
               if not any(nd in n for nd in no_decay)],
    'weight_decay': 0.01
}, {
    'params': [p for n, p in model.named_parameters()
               if any(nd in n for nd in no_decay)],
    'weight_decay': 0.0
}]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)
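# --- Assumed usage sketch (not part of the original snippet above) ---
# One optimization step with the grouped-parameter AdamW just defined. The
# tokenizer checkpoint (config['pretrained']) and the toy batch are assumptions
# for illustration; it also assumes a recent transformers version where the
# model returns a ModelOutput with a .loss field when labels are provided.
from transformers import BertTokenizerFast
import torch

tokenizer = BertTokenizerFast.from_pretrained(config['pretrained'])
batch = tokenizer(["an example sentence"], return_tensors='pt',
                  padding=True, truncation=True)
batch['labels'] = torch.tensor([0])

outputs = model(**batch)   # the loss is computed because labels are provided
outputs.loss.backward()
optimizer.step()
optimizer.zero_grad()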
def __init__(self, weight="bert-base-japanese-whole-word-masking"):
    self.weight = weight
    self.tokenizer = BertJapaneseTokenizer.from_pretrained(self.weight)
    self.model = BertForSequenceClassification.from_pretrained(self.weight)
def __init__(self, options_name: str = "bert-base-cased"):
    super(BERT, self).__init__()
    self.encoder = BertForSequenceClassification.from_pretrained(options_name)
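# --- Assumed sketch (not in the original snippet): a forward method for the
# wrapper above. BertForSequenceClassification computes the loss itself when
# labels are passed, so the wrapper can simply delegate to its encoder.
def forward(self, input_ids, attention_mask=None, labels=None):
    # In recent transformers versions this returns a SequenceClassifierOutput
    # with .loss (when labels are given) and .logits fields.
    return self.encoder(input_ids, attention_mask=attention_mask, labels=labels)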
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn as nn
import csv
import pickle
import os
from pycocotools.coco import COCO

# load model
state_dict = torch.load('./checkpoints/moco.p')
# for para in model:
#     print(para, "\t", model[para].size())

# create model
net = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=80,
    output_attentions=False,
    output_hidden_states=False,
)
# for para in net.state_dict():
#     print(para, "\t", net.state_dict()[para].size())

# fc_features = net.classifier.in_features
# net.classifier = nn.Linear(fc_features, 2)

# load parameters
# net.load_state_dict(state_dict)
# for para in net.state_dict():
#     print(para, "\t", net.state_dict()[para].size())
def load_model(self, path_model, path_config):
    # self.model = BertForSequenceClassification(BertConfig(path_config), num_labels=self.num_classes,
    #                                            output_attentions=False, output_hidden_states=True)
    self.model = BertForSequenceClassification.from_pretrained(path_model)
    self.tokenizer = BertTokenizer.from_pretrained(path_model)
    # self.model.load_state_dict(torch.load(path_model))
    self.__init_model()
# Train and evaluate using tf.keras.Model.fit()
train_steps = train_examples // BATCH_SIZE
valid_steps = valid_examples // EVAL_BATCH_SIZE
history = model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=train_steps,
                    validation_data=valid_dataset, validation_steps=valid_steps)

# Save TF2 model
os.makedirs('./save/', exist_ok=True)
model.save_pretrained('./save/')

if TASK == "mrpc":
    # Load the TensorFlow model in PyTorch for inspection
    # This is to demo the interoperability between the two frameworks, you don't have to
    # do this in real life (you can run the inference on the TF model).
    pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)

    # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
    sentence_0 = 'This research was consistent with his findings.'
    sentence_1 = 'His findings were compatible with this research.'
    sentence_2 = 'His findings were not compatible with this research.'
    inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
    inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
    del inputs_1["special_tokens_mask"]
    del inputs_2["special_tokens_mask"]
    pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
    pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
    print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
    print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0')
def main():
    # 1. Basic setup
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--pytorch-only', action='store_true', default=False,
                        help='disables ONNX Runtime training')
    parser.add_argument('--batch-size', type=int, default=32, metavar='N',
                        help='input batch size for training (default: 32)')
    parser.add_argument('--test-batch-size', type=int, default=64, metavar='N',
                        help='input batch size for testing (default: 64)')
    parser.add_argument('--view-graphs', action='store_true', default=False,
                        help='views forward and backward graphs')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--epochs', type=int, default=4, metavar='N',
                        help='number of epochs to train (default: 4)')
    parser.add_argument('--seed', type=int, default=42, metavar='S',
                        help='random seed (default: 42)')
    parser.add_argument('--log-interval', type=int, default=40, metavar='N',
                        help='how many batches to wait before logging training status (default: 40)')
    parser.add_argument('--train-steps', type=int, default=-1, metavar='N',
                        help='number of steps to train. Set -1 to run through whole dataset (default: -1)')
    parser.add_argument('--log-level',
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                        default='WARNING',
                        help='Log level (default: WARNING)')
    parser.add_argument('--num-hidden-layers', type=int, default=1, metavar='H',
                        help='Number of hidden layers for the BERT model. A vanilla BERT has 12 hidden layers (default: 1)')
    parser.add_argument('--data-dir', type=str, default='./cola_public/raw',
                        help='Path to the bert data directory')
    args = parser.parse_args()

    # Device (CPU vs CUDA)
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # Set log level
    numeric_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % args.log_level)
    logging.basicConfig(level=numeric_level)

    # 2. Dataloader
    train_dataloader, validation_dataloader = load_dataset(args)

    # 3. Modeling
    # Load BertForSequenceClassification, the pretrained BERT model with a single
    # linear classification layer on top.
    config = AutoConfig.from_pretrained(
        "bert-base-uncased",
        num_labels=2,
        num_hidden_layers=args.num_hidden_layers,
        output_attentions=False,     # Whether the model returns attention weights.
        output_hidden_states=False,  # Whether the model returns all hidden states.
    )
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
        config=config,
    )

    if not args.pytorch_only:
        # Just for future debugging
        debug_options = DebugOptions(save_onnx=False,
                                     onnx_prefix='BertForSequenceClassificationAutoCast')
        model = ORTModule(model, debug_options)
        model._torch_module._execution_manager(is_training=True)._enable_grad_acc_optimization = True

    # Tell pytorch to run this model on the GPU.
    if torch.cuda.is_available() and not args.no_cuda:
        model.cuda()

    # Note: AdamW is a class from the huggingface library (as opposed to pytorch)
    optimizer = AdamW(
        model.parameters(),
        lr=2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=1e-8  # args.adam_epsilon - default is 1e-8.
    )  # Authors recommend between 2 and 4 epochs

    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * args.epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)

    scaler = torch.cuda.amp.GradScaler()

    # Seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    onnxruntime.set_seed(args.seed)
    if torch.cuda.is_available() and not args.no_cuda:
        torch.cuda.manual_seed_all(args.seed)

    # 4. Train loop (fine-tune)
    total_training_time, total_test_time, epoch_0_training, validation_accuracy = 0, 0, 0, 0
    for epoch_i in range(0, args.epochs):
        total_training_time += train(model, optimizer, scaler, scheduler,
                                     train_dataloader, epoch_i, device, args)
        if not args.pytorch_only and epoch_i == 0:
            epoch_0_training = total_training_time
        test_time, validation_accuracy = test(model, validation_dataloader, device, args)
        total_test_time += test_time

    assert validation_accuracy > 0.5

    print('\n======== Global stats ========')
    if not args.pytorch_only:
        estimated_export = 0
        if args.epochs > 1:
            estimated_export = epoch_0_training - (
                total_training_time - epoch_0_training) / (args.epochs - 1)
            print(" Estimated ONNX export took: {:.4f}s".format(estimated_export))
        else:
            print(" Estimated ONNX export took: Estimate available when epochs > 1 only")
        print(" Accumulated training without export took: {:.4f}s".format(
            total_training_time - estimated_export))
    print(" Accumulated training took: {:.4f}s".format(total_training_time))
    print(" Accumulated validation took: {:.4f}s".format(total_test_time))
                    help="logging steps")
parser.add_argument("--learning_rate", type=float, default=5e-5,
                    help="learning rate")
parser.add_argument("--output_dir", type=str, default='./checkpoints',
                    help="directory where check points are saved during training")
args = parser.parse_args()

# Load the BERT base cased tokenizer and pre-trained model
tokenizer = BertTokenizerFast.from_pretrained(args.model_type)
seq_model = BertForSequenceClassification.from_pretrained(args.model_type, num_labels=2)

# Load the dataset
training_texts, training_spans = load_dataset(args.train_dir)
val_texts, val_spans = load_dataset(args.dev_dir)
test_texts, test_spans = load_dataset(args.test_dir)

# Split each post into sentences
training_sentences, training_labels, training_sentence_spans, tr_post_to_sentence_num = split_into_setences(
    training_texts, training_spans)
val_sentences, val_labels, val_sentence_spans, val_post_to_sentence_num = split_into_setences(
    val_texts, val_spans)
test_sentences, test_labels, test_sentence_spans, test_post_to_sentence_num = split_into_setences(
    test_texts, test_spans)

# Tokenize the sentences
def dist_train(rank, world_size, data_dir, epochs):
    logger.info("Running on rank: %i", rank)
    seed = 1234
    lr = 2e-5
    batch_size = 32
    accumulation_steps = 1

    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.cuda.set_device(rank)

    logger.info('loading data..')
    train_dataset, val_dataset = load_data(data_dir, seed)
    y_columns = ['target']

    # use torch.utils.data.distributed.DistributedSampler here to create a sampler.
    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
        # uncomment this once you have your sampler working
        # sampler=train_sampler
    )
    logger.info("len of train_loader: %i", len(train_loader))

    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=len(y_columns)).cuda()  # move model to GPU with id rank

    # use nn.parallel.DistributedDataParallel to put your model on multiple GPUs
    # replace the line below with your own
    ddp_model = model

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=lr)

    tq = trange(epochs)
    for epoch in tq:
        ddp_model.train()
        ddp_train(rank, world_size, ddp_model, train_loader, optimizer, accumulation_steps)
        if rank == 0:
            torch.save(ddp_model.state_dict(), CHECKPOINT_PATH)
def CV(input_ids, attention_masks, pred_labels, dataset, FILE, device):
    # -------------------------------------------------------------------
    # Use train_test_split to split our data into train and validation sets for
    # training
    for LABEL in pred_labels:
        print("working on", LABEL, "\n")
        labels = dataset[LABEL].values
        max_f1 = 0

        # Use 90% for training and 10% for validation.
        train_inputs, validation_inputs, train_labels, validation_labels = \
            train_test_split(input_ids, labels, random_state=2018, test_size=0.1)
        # Do the same for the masks.
        train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
                                                               random_state=2018, test_size=0.1)

        validation_accuracies = []
        validation_precisions = []
        validation_f1s = []
        validation_recalls = []

        num_folds = 10
        skf = StratifiedKFold(n_splits=num_folds, shuffle=True)
        fold = 1
        for train_idx, test_idx in skf.split(input_ids, labels):
            print("----------------------------fold = " + str(fold) + "-----------------------------")
            fold += 1
            train_inputs = input_ids[train_idx]
            train_labels = labels[train_idx]
            train_masks = [attention_masks[idx] for idx in train_idx]
            validation_inputs = input_ids[test_idx]
            validation_labels = labels[test_idx]
            validation_masks = [attention_masks[idx] for idx in test_idx]

            # -------------------------------------------------------------------
            # Convert all inputs and labels into torch tensors, the required datatype
            # for our model.
            train_inputs = torch.tensor(train_inputs)
            validation_inputs = torch.tensor(validation_inputs)
            train_labels = torch.tensor(train_labels)
            validation_labels = torch.tensor(validation_labels)
            train_masks = torch.tensor(train_masks)
            validation_masks = torch.tensor(validation_masks)

            # -------------------------------------------------------------------
            # This section is basically aiming to save runtime memory using torch DataLoader class :)
            # The DataLoader needs to know our batch size for training, so we specify it here.
            # For fine-tuning BERT on a specific task, the authors recommend a batch size of
            # 16 or 32.
            batch_size = 32

            # Create the DataLoader for our training set.
            train_data = TensorDataset(train_inputs, train_masks, train_labels)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

            # Create the DataLoader for our validation set.
            validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
            validation_sampler = SequentialSampler(validation_data)
            validation_dataloader = DataLoader(validation_data, sampler=validation_sampler,
                                               batch_size=batch_size)

            # -------------------------------------------------------------------
            # Load BertForSequenceClassification, the pretrained BERT model with a single
            # linear classification layer on top.
            model = BertForSequenceClassification.from_pretrained(
                "bert-base-uncased",         # Use the 12-layer BERT model, with an uncased vocab.
                num_labels=6,                # The number of output labels--2 for binary classification.
                                             # You can increase this for multi-class tasks.
                output_attentions=False,     # Whether the model returns attention weights.
                output_hidden_states=False,  # Whether the model returns all hidden states.
            )

            # Tell pytorch to run this model on the GPU.
            model.cuda()

            # -------------------------------------------------------------------
            # Get all of the model's parameters as a list of tuples.
            params = list(model.named_parameters())

            # print('The BERT model has {:} different named parameters.\n'.format(len(params)))
            #
            # print('==== Embedding Layer ====\n')
            #
            # for p in params[0:5]:
            #     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
            #
            # print('\n==== First Transformer ====\n')
            #
            # for p in params[5:21]:
            #     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
            #
            # print('\n==== Output Layer ====\n')
            #
            # for p in params[-4:]:
            #     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

            # -------------------------------------------------------------------
            """
            For the purposes of fine-tuning, the authors recommend choosing from the following values:
                Batch size: 16, 32 (We chose 32 when creating our DataLoaders).
                Learning rate (Adam): 5e-5, 3e-5, 2e-5 (We'll use 2e-5).
                Number of epochs: 2, 3, 4 (We'll use 4).
            """

            # Note: AdamW is a class from the huggingface library (as opposed to pytorch);
            # the 'W' stands for 'Weight Decay fix'.
            optimizer = AdamW(model.parameters(),
                              lr=2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                              eps=1e-8  # args.adam_epsilon - default is 1e-8.
                              )

            # Number of training epochs (authors recommend between 2 and 4)
            epochs = 2

            # Total number of training steps is number of batches * number of epochs.
            total_steps = len(train_dataloader) * epochs

            # Create the learning rate scheduler.
            scheduler = get_linear_schedule_with_warmup(optimizer,
                                                        num_warmup_steps=0,  # Default value in run_glue.py
                                                        num_training_steps=total_steps)

            # -------------------------------------------------------------------
            # Function to calculate the accuracy of our predictions vs labels
            def flat_accuracy(preds, labels):
                pred_flat = np.argmax(preds, axis=1).flatten()
                labels_flat = labels.flatten()
                return np.sum(pred_flat == labels_flat) / len(labels_flat)

            def flat_precision(preds, labels, num_labels=None):
                pred_flat = np.argmax(preds, axis=1).flatten()
                labels_flat = labels.flatten()
                return precision_score(labels_flat, pred_flat, average="micro")

            def flat_recall(preds, labels, num_labels=None):
                pred_flat = np.argmax(preds, axis=1).flatten()
                labels_flat = labels.flatten()
                return recall_score(labels_flat, pred_flat, average="micro")

            def format_time(elapsed):
                '''Takes a time in seconds and returns a string hh:mm:ss'''
                # Round to the nearest second.
                elapsed_rounded = int(round((elapsed)))
                # Format as hh:mm:ss
                return str(datetime.timedelta(seconds=elapsed_rounded))

            # -------------------------------------------------------------------
            # This training code is based on the `run_glue.py` script here:
            # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

            # Set the seed value all over the place to make this reproducible.
            seed_val = 42
            random.seed(seed_val)
            np.random.seed(seed_val)
            torch.manual_seed(seed_val)
            torch.cuda.manual_seed_all(seed_val)

            # Store the average loss after each epoch so we can plot them.
            loss_values = []

            # For each epoch...
            for epoch_i in range(0, epochs):

                # ========================================
                #               Training
                # ========================================

                # Perform one full pass over the training set.
                print("")
                print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
                print('Training...')

                # Measure how long the training epoch takes.
                t0 = time.time()

                # Reset the total loss for this epoch.
                total_loss = 0

                # Put the model into training mode. Don't be misled--the call to
                # `train` just changes the *mode*, it doesn't *perform* the training.
                # `dropout` and `batchnorm` layers behave differently during training
                # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
                model.train()

                # For each batch of training data...
                for step, batch in enumerate(train_dataloader):

                    # Progress update every 40 batches.
                    if step % 40 == 0 and not step == 0:
                        # Calculate elapsed time in minutes.
                        elapsed = format_time(time.time() - t0)
                        # Report progress.
                        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                            step, len(train_dataloader), elapsed))

                    # Unpack this training batch from our dataloader.
                    #
                    # As we unpack the batch, we'll also copy each tensor to the GPU using the
                    # `to` method.
                    #
                    # `batch` contains three pytorch tensors:
                    #   [0]: input ids
                    #   [1]: attention masks
                    #   [2]: labels
                    b_input_ids = batch[0].to(device)
                    b_input_mask = batch[1].to(device)
                    b_labels = batch[2].to(device)

                    # Always clear any previously calculated gradients before performing a
                    # backward pass. PyTorch doesn't do this automatically because
                    # accumulating the gradients is "convenient while training RNNs".
                    # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
                    model.zero_grad()

                    # Perform a forward pass (evaluate the model on this training batch).
                    # This will return the loss (rather than the model output) because we
                    # have provided the `labels`.
                    # The documentation for this `model` function is here:
                    # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                    outputs = model(b_input_ids,
                                    token_type_ids=None,
                                    attention_mask=b_input_mask,
                                    labels=b_labels)

                    # The call to `model` always returns a tuple, so we need to pull the
                    # loss value out of the tuple.
                    loss = outputs[0]

                    # Accumulate the training loss over all of the batches so that we can
                    # calculate the average loss at the end. `loss` is a Tensor containing a
                    # single value; the `.item()` function just returns the Python value
                    # from the tensor.
                    total_loss += loss.item()

                    # Perform a backward pass to calculate the gradients.
                    loss.backward()

                    # Clip the norm of the gradients to 1.0.
                    # This is to help prevent the "exploding gradients" problem.
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                    # Update parameters and take a step using the computed gradient.
                    # The optimizer dictates the "update rule"--how the parameters are
                    # modified based on their gradients, the learning rate, etc.
                    optimizer.step()

                    # Update the learning rate.
                    scheduler.step()

                # Calculate the average loss over the training data.
                avg_train_loss = total_loss / len(train_dataloader)

                # Store the loss value for plotting the learning curve.
                loss_values.append(avg_train_loss)

                print("")
                print("  Average training loss: {0:.2f}".format(avg_train_loss))
                print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

                # ========================================
                #               Validation
                # ========================================
                # After the completion of each training epoch, measure our performance on
                # our validation set.

                print("")
                print("Running Validation...")

                t0 = time.time()

                # Put the model in evaluation mode--the dropout layers behave differently
                # during evaluation.
                model.eval()

                # Tracking variables
                eval_loss, eval_accuracy, eval_precision, eval_recall = 0, 0, 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0

                # Evaluate data for one epoch
                for batch in validation_dataloader:
                    # Add batch to GPU
                    batch = tuple(t.to(device) for t in batch)

                    # Unpack the inputs from our dataloader
                    b_input_ids, b_input_mask, b_labels = batch

                    # Telling the model not to compute or store gradients, saving memory and
                    # speeding up validation
                    with torch.no_grad():
                        # Forward pass, calculate logit predictions.
                        # This will return the logits rather than the loss because we have
                        # not provided labels.
                        # token_type_ids is the same as the "segment ids", which
                        # differentiates sentence 1 and 2 in 2-sentence tasks.
                        # The documentation for this `model` function is here:
                        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                        outputs = model(b_input_ids,
                                        token_type_ids=None,
                                        attention_mask=b_input_mask)

                    # Get the "logits" output by the model. The "logits" are the output
                    # values prior to applying an activation function like the softmax.
                    logits = outputs[0]

                    # Move logits and labels to CPU
                    logits = logits.detach().cpu().numpy()
                    label_ids = b_labels.to('cpu').numpy()

                    # Calculate the accuracy for this batch of test sentences.
                    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
                    tmp_eval_precision = flat_precision(logits, label_ids)
                    tmp_eval_recall = flat_recall(logits, label_ids)

                    # Accumulate the total accuracy.
                    eval_accuracy += tmp_eval_accuracy
                    eval_precision += tmp_eval_precision
                    eval_recall += tmp_eval_recall

                    # Track the number of batches
                    nb_eval_steps += 1

                # Report the final accuracy for this validation run.
                print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_steps))
                validation_accuracies.append((eval_accuracy / nb_eval_steps))
                print("  Precision: {0:.2f}".format(eval_precision / nb_eval_steps))
                validation_precisions.append((eval_precision / nb_eval_steps))
                print("  Recall: {0:.2f}".format(eval_recall / nb_eval_steps))
                validation_recalls.append((eval_recall / nb_eval_steps))

                temp_precision = validation_precisions[-1]
                temp_recall = validation_recalls[-1]
                if temp_precision * temp_recall == 0:
                    F1 = 0
                else:
                    F1 = 2 * temp_precision * temp_recall / (temp_precision + temp_recall)
                validation_f1s.append(F1)
                print("  F1: {0:.2f}".format(F1))
                if F1 > max_f1:
                    model.save_pretrained("model_" + LABEL)
                    max_f1 = F1

                print("  Validation took: {:}".format(format_time(time.time() - t0)))

            print("")
            print("Training complete!")

        # -------------------------------------------------------------------
        from numpy import asarray
        from numpy import savetxt

        # define data
        data = asarray(validation_recalls)
        # save to csv file
        savetxt(FILE + "_" + LABEL + '_validation_recalls.csv', data, delimiter=',')
        # define data
        data = asarray(validation_precisions)
        # save to csv file
        savetxt(FILE + "_" + LABEL + '_validation_precision.csv', data, delimiter=',')
        data = asarray(validation_accuracies)
        # save to csv file
        savetxt(FILE + "_" + LABEL + '_validation_accuracies.csv', data, delimiter=',')
        data = asarray(validation_f1s)
        # save to csv file
        savetxt(FILE + "_" + LABEL + '_validation_F1.csv', data, delimiter=',')

        # Use plot styling from seaborn.
        sns.set(style='darkgrid')

        # Increase the plot size and font size.
        sns.set(font_scale=1.5)
        plt.rcParams["figure.figsize"] = (12, 6)

        # Plot the learning curve.
        plt.plot(loss_values, 'b-o')

        # Label the plot.
        plt.title("Training loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.savefig(LABEL + "_" + FILE + "_" + "Loss.png")
def train():
    """Fine-tune bert using IMDB data"""
    # deal with warnings for now
    os.system('clear')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('using gpu:', torch.cuda.get_device_name(gpu_num))

    sentences, labels = load_data()
    print('loaded %d examples and %d labels...' % (len(sentences), len(labels)))

    # tokenize and convert to ints
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long",
                              truncating="post", padding="post")

    # create attention masks
    attention_masks = []
    for seq in input_ids:
        # use 1s for tokens and 0s for padding
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    # make validation set
    train_inputs, validation_inputs, train_labels, validation_labels = \
        train_test_split(input_ids, labels, test_size=0.1, random_state=0)
    train_masks, validation_masks, _, _ = \
        train_test_split(attention_masks, input_ids, test_size=0.1, random_state=0)

    # convert everything into torch tensors
    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)
    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)
    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)

    # create iterators for our data
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler,
                                       batch_size=batch_size)

    # load pretrained bert model with a single linear classification layer on top
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    model.cuda()

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]

    # this variable contains all of the hyperparameter information our training loop needs
    optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps,
                                     t_total=num_total_steps)

    # store our loss and accuracy for plotting
    train_loss_set = []

    # training loop
    for epoch in trange(epochs, desc="epoch"):

        # Set our model to training mode (as opposed to evaluation mode)
        model.train()

        # Tracking variables
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        # train for one epoch
        for step, batch in enumerate(train_dataloader):
            # add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch
            # clear out the gradients (by default they accumulate)
            optimizer.zero_grad()
            # forward pass
            loss, logits = model(b_input_ids, token_type_ids=None,
                                 attention_mask=b_input_mask, labels=b_labels)
            train_loss_set.append(loss.item())
            # backward pass
            loss.backward()
            # update parameters and take a step using the computed gradient
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()

            # update tracking variables
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

        print("epoch: {}, loss: {}".format(epoch, tr_loss / nb_tr_steps))

        # put model in evaluation mode to evaluate loss on the validation set
        model.eval()

        # tracking variables
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        # evaluate data for one epoch
        for batch in validation_dataloader:
            # add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = batch
            # don't compute or store gradients
            with torch.no_grad():
                # forward pass; only logits returned since labels not provided
                [logits] = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            # move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1

        print("validation accuracy: {}\n".format(eval_accuracy / nb_eval_steps))
if args.prepare_data:
    data_utils.prepare_data(args.task, path=data_path, sequence_length=args.sequence_length)

logs_path = 'logs'  # Logs path
if not os.path.exists(logs_path):
    os.makedirs(logs_path)
save_path = data_utils.get_save_dir(logs_path, args.name)

if not torch.cuda.is_available():
    print('GPU not available. Running on CPU...')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', cache_dir=CACHE_DIR).to(device)

supervised_train_dataset = data_utils.SupervisedDataset(
    os.path.join(data_path, args.task, 'train_uda_ids.pt'))
supervised_validation_dataset = data_utils.SupervisedDataset(
    os.path.join(data_path, args.task, 'val_uda_ids.pt'))
unsupervised_dataset = data_utils.UnsupervisedDataset(
    os.path.join(data_path, args.task, 'unsup_ori_uda_ids.pt'),
    os.path.join(data_path, args.task, 'unsup_aug_uda_ids.pt'))

supervised_train_dataloader = DataLoader(supervised_train_dataset,
                                         batch_size=args.batch_size,
                                         shuffle=True)
supervised_validation_dataloader = DataLoader(
    supervised_validation_dataset,
    batch_size=args.batch_size,
            sample_idx = idx
        else:
            sample_idx = idx - dset.cumulative_sizes[dataset_idx - 1]
        g.write(f'{dataset_idx},{sample_idx}\n')

train_ds = subsets[0]
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
                      collate_fn=collate_batch_transformer)

val_ds = subsets[1]
validation_evaluator = ClassificationEvaluator(val_ds, device)

# Create the model
model = BertForSequenceClassification.from_pretrained(
    bert_model, config=bert_config).to(device)
if args.pretrained_model is not None:
    weights = {k: v for k, v in torch.load(args.pretrained_model).items()
               if "classifier" not in k}
    model_dict = model.state_dict()
    model_dict.update(weights)
    model.load_state_dict(model_dict)

# Create the optimizer
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [p for n, p in model.named_parameters()
def __init__(self, model_path):
    self.model = BertForSequenceClassification.from_pretrained(model_path)
    self.tokenizer = BertTokenizer.from_pretrained(model_path)
    self.labels_map = self.model.config.id2label
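# --- Hypothetical inference helper (a sketch, not part of the original class):
# tokenize a single text, take the argmax over the logits, and map the predicted
# class id back to a label string via labels_map. Assumes `torch` is imported and
# a recent transformers version where the model output exposes `.logits`.
def predict(self, text):
    inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = self.model(**inputs).logits
    return self.labels_map[logits.argmax(dim=-1).item()]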
# Set the directories where the models will be saved
bert_dir = args.pretrain

# Generate the data loader
test_dataset = NaverSentimentDataset(train=False)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=test_dataset.collate_fn,
    num_workers=2,
)

# Generate the classification model from the pretrained BERT checkpoint
model = BertForSequenceClassification.from_pretrained(bert_dir)
model = model.to(device)

# Evaluation
total_test_loss = 0.0
total_test_accuracy = 0
model.eval()
with torch.no_grad():
    for input_ids, token_type_ids, attention_mask, rating in tqdm(test_loader):
        input_ids = input_ids.to(device)
        token_type_ids = token_type_ids.to(device)
        attention_mask = attention_mask.to(device)
        rating_bool = rating.bool().to(device)
        rating = rating.float().to(device)
def fit(self, sentences, labels, model_save_path, device, do_lower_case=True, debug=False,
        max_length=512, add_special_tokens=True, test_size=0.1, batch_size=8,
        output_attentions=True, output_hidden_states=True, epochs=2):
    '''
    - sentences : input strings (as a numpy array)
    - labels : numerical labels (as a numpy array); labels should be 0, 1, 2, and so on.
      Example of how to obtain them: labels = train_data['labels'].values
    - test_size : validation size (the train/validation split, basically)
    '''
    print('Loading BERT tokenizer...')
    # Load the BERT tokenizer.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=do_lower_case)

    if debug:
        # Print the original sentence.
        print(' Original: ', sentences[0])
        # Print the sentence split into tokens.
        print('Tokenized: ', tokenizer.tokenize(sentences[0]))
        # Print the sentence mapped to token ids.
        print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

    input_ids, attention_masks = self.get_inputid_attentionmasks(
        tokenizer, sentences, debug=False, max_length=max_length, add_special_tokens=True)

    # Use 90% for training and 10% for validation.
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
        input_ids, labels, random_state=self.random_state, test_size=test_size)
    # Do the same for the masks.
    train_masks, validation_masks, _, _ = train_test_split(
        attention_masks, labels, random_state=self.random_state, test_size=test_size)

    train_data, train_sampler, train_dataloader = self.create_data(
        train_inputs, train_masks, train_labels, batch_size)
    validation_data, validation_sampler, validation_dataloader = self.create_data(
        validation_inputs, validation_masks, validation_labels, batch_size)

    # Load BertForSequenceClassification, the pretrained BERT model with a single
    # linear classification layer on top.
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=self.NUM_CLASS,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states)
    model.cuda()  # Tell pytorch to run this model on the GPU.

    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * epochs

    # Note: AdamW is a class from the huggingface library (as opposed to pytorch);
    # the 'W' stands for 'Weight Decay fix'.
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                num_training_steps=total_steps)

    print("")
    print('Training Batches: ', len(train_dataloader))
    print('Validation Batches: ', len(validation_dataloader))
    print('Batch Size: ', batch_size)
    print('Epochs: ', epochs)

    random.seed(self.seed_val)
    np.random.seed(self.seed_val)
    torch.manual_seed(self.seed_val)
    torch.cuda.manual_seed_all(self.seed_val)

    # Store the average loss after each epoch so we can plot them.
    loss_values = []
    logits_list, label_ids_list = [], []

    model = self.train_val_loop(model, epochs, train_dataloader, optimizer, scheduler,
                                validation_dataloader, model_save_path, custom_name='_')
    return model, tokenizer, device, max_length
def __init__(self):
    super(BERT, self).__init__()
    options_name = "bert-base-uncased"
    self.encoder = BertForSequenceClassification.from_pretrained(options_name, num_labels=2)
def load_pretrained_model(self, model_name):
    self.model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(self.label_dict),
        output_attentions=False,
        output_hidden_states=False)
def main():
    # Create the model
    model = BertForSequenceClassification.from_pretrained(
        model_name, num_labels=2, mirror="tuna")  # num_labels=2: two classes, positive and negative reviews
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Load the data
    if model_name == "../bert-base-uncased":
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
    else:
        tokenizer = BertTokenizer.from_pretrained(model_name, mirror="tuna")
    train_dataloader, test_dataloader = dataload.solve_data(
        tokenizer,
        dataname=DataName,
        limit_size=Limit_size,
        BATCH_SIZE=BATCH_SIZE,
        using_clickbait_dic=USING_CLICKBAIT_DIC,
    )

    # Number of training steps: [number of batches] x [number of epochs].
    total_steps = len(train_dataloader) * epochs

    # Define the optimizer
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=epsilon)

    # Set up the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                num_training_steps=total_steps)

    records_Train = {}
    records_Test = {}
    best_epoch = -1
    best_score = -1
    for epoch in range(epochs):
        train_loss, train_acc, train_precision, train_recall, train_F1 = train(
            model, device, train_dataloader, optimizer, scheduler)
        print("epoch={}, train accuracy={}, loss={}, precision={}, recall={}, F1={}".format(
            epoch, train_acc, train_loss, train_precision, train_recall, train_F1))

        test_acc, test_precision, test_recall, test_F1 = evaluate(
            model, device, test_dataloader)
        print("epoch={}, test accuracy={}, precision={}, recall={}, F1={}".format(
            epoch, test_acc, test_precision, test_recall, test_F1))

        records_Train["Epoch" + str(epoch)] = [
            train_acc, train_loss, train_precision, train_recall, train_F1,
        ]
        records_Test["Epoch" + str(epoch)] = [
            test_acc, test_precision, test_recall, test_F1,
        ]
        if test_F1 > best_score:
            best_score = test_F1
            best_epoch = epoch

        # if epoch == 2:
        #     print("###### Save Model ######")
        #     model.save_pretrained("./save_model")
        #     tokenizer.save_pretrained("./save_model")

    print("###### Finished ######")
    train_acc, train_loss, train_precision, train_recall, train_F1 = records_Train["Epoch" + str(best_epoch)]
    test_acc, test_precision, test_recall, test_F1 = records_Test["Epoch" + str(best_epoch)]
    print("best_epoch={}, train accuracy={}, loss={}, precision={}, recall={}, F1={}".format(
        best_epoch, train_acc, train_loss, train_precision, train_recall, train_F1))
    print("best_epoch={}, test accuracy={}, precision={}, recall={}, F1={}".format(
        best_epoch, test_acc, test_precision, test_recall, test_F1))
processor = preProcessor()
tokenizer = BertTokenizer.from_pretrained(args_train["pre_train_model"], return_tensors='pt')
train_dataset = load_and_cache_example(args_train, tokenizer, processor, 'simtrain')
val_dataset = load_and_cache_example(args_train, tokenizer, processor, 'simdev')

bert_config = BertConfig.from_pretrained(args_train["pre_train_model"])
bert_config.num_labels = len(processor.get_labels())
model_kwargs = {'config': bert_config, "from_tf": True}
model = BertForSequenceClassification.from_pretrained(
    args_train["pre_train_model"], **model_kwargs)
model = model.to(device)

train_dataloader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=args_train["batch_size"])
val_dataloader = DataLoader(val_dataset,
                            sampler=SequentialSampler(val_dataset),
                            batch_size=args_train["batch_size"])
del train_dataset, val_dataset
gc.collect()

train(args_train, train_dataloader, val_dataloader, model)
def __init__(self, weight="bert-base-uncased"):
    self.weight = weight
    self.tokenizer = BertTokenizer.from_pretrained(self.weight)
    self.model = BertForSequenceClassification.from_pretrained(self.weight)
def test():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args_test = {
        "data_dir": '/content/gdrive/My Drive/nlpqa3/data/',
        "load_path": '/content/gdrive/My Drive/nlpqa3/output/data/',
        # "vocab_file": 'bert-base-chinese-vocab.txt'
        # "model_config": 'pytorch_model.bin',
        # "model_path": 'config.json',
        "max_seq_length": 128,
        "batch_size": 32,
        "learning_rate": 6e-6,
        "epochs": 3,
        "device": device
    }

    tokenizer = BertTokenizer(os.path.join(args_test["data_dir"], 'bert-base-chinese-vocab.txt'),
                              return_tensors='pt')
    processor = preProcessor()
    test_dataset = load_and_cache_example(args_test, tokenizer, processor, 'simtest')

    bert_config = BertConfig.from_pretrained(
        os.path.join(args_test["load_path"], 'config.json'))
    bert_config.num_labels = len(processor.get_labels())

    test_dataloader = DataLoader(test_dataset,
                                 sampler=SequentialSampler(test_dataset),
                                 batch_size=args_test["batch_size"])

    model_kwargs = {'config': bert_config, "from_tf": False}
    model = BertForSequenceClassification.from_pretrained(
        os.path.join(args_test["load_path"], 'pytorch_model.bin'), **model_kwargs)
    # model.load_state_dict(torch.load(os.path.join(args["load_path"], "model_path")))
    model = model.to(device)

    del test_dataset
    gc.collect()

    total_loss = 0.0
    total_sample = 0  # number of samples
    all_real_labels = []
    all_pred_labels = []
    for batch in test_dataloader:
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss, logits = outputs[0], outputs[1]
            total_loss += loss.item()
            # total_loss += loss * batch[0].shape[0]  # loss * number of samples
            logits = logits.detach().cpu().numpy()
            label_ids = batch[3].to('cpu').numpy()
            total_sample += batch[0].shape[0]  # count the samples
            # pred = logits.argmax(dim=-1).tolist()  # convert the predicted labels to a list
            pred = np.argmax(logits, axis=1).flatten()
            all_pred_labels.extend(pred)
            all_real_labels.extend(batch[3].view(-1).tolist())

    loss = total_loss / total_sample
    question_acc, label_acc = calc_acc(all_real_labels, all_pred_labels)
    print("avg_loss", loss)
    print("question_acc", question_acc)
    print("label_acc", label_acc)
# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler,
                                   batch_size=batch_size)

"""# 4. Train Our Classification Model"""

from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",         # Use the 12-layer BERT model, with an uncased vocab.
    num_labels=2,                # The number of output labels--2 for binary classification.
                                 # You can increase this for multi-class tasks.
    output_attentions=False,     # Whether the model returns attentions weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()

# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
def main():
    args = parse_args()

    use_cuda = torch.cuda.is_available()
    torch.backends.cudnn.benchmark = True
    random.seed(1337)
    torch.manual_seed(1337)
    if not use_cuda:
        print("warning, the experiments would take ages to run on cpu")

    hyperparams = vars(args)

    heuristic = get_heuristic(hyperparams['heuristic'], hyperparams['shuffle_prop'])
    model = BertForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=hyperparams["model"])
    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=hyperparams["model"])

    # In this example we use the tokenizer once only in the beginning since it would
    # make the whole process faster. However, it is also possible to pass the tokenizer
    # to the trainer.
    active_set, test_set = get_datasets(hyperparams['initial_pool'], tokenizer)

    # change dropout layers to MCDropout
    model = patch_module(model)

    if use_cuda:
        model.cuda()
    init_weights = deepcopy(model.state_dict())

    training_args = TrainingArguments(
        output_dir='/app/baal/results',                   # output directory
        num_train_epochs=hyperparams['learning_epoch'],   # total # of training epochs
        per_device_train_batch_size=16,                   # batch size per device during training
        per_device_eval_batch_size=64,                    # batch size for evaluation
        weight_decay=0.01,                                # strength of weight decay
        logging_dir='/app/baal/logs',                     # directory for storing logs
    )

    # We wrap the huggingface Trainer to create an Active Learning Trainer
    model = BaalTransformersTrainer(model=model,
                                    args=training_args,
                                    train_dataset=active_set,
                                    eval_dataset=test_set,
                                    tokenizer=None)

    logs = {}
    logs['epoch'] = 0

    # In this case, nlp data is fast to process and we do NOT need to use a smaller batch_size
    active_loop = ActiveLearningLoop(active_set,
                                     model.predict_on_dataset,
                                     heuristic,
                                     hyperparams.get('n_data_to_label', 1),
                                     iterations=hyperparams['iterations'])

    for epoch in tqdm(range(args.epoch)):
        # we use the default setup of HuggingFace for training (ex: epoch=1).
        # The setup is adjustable when BaalHuggingFaceTrainer is defined.
        model.train()

        # Validation!
        eval_metrics = model.evaluate()

        # We reorder the unlabelled pool at the frequency of learning_epoch.
        # This helps with speed while not changing the quality of uncertainty estimation.
        should_continue = active_loop.step()

        # We reset the model weights to relearn from the new trainset.
        model.load_state_dict(init_weights)
        model.lr_scheduler = None
        if not should_continue:
            break

        active_logs = {"epoch": epoch,
                       "labeled_data": active_set._labelled,
                       "Next Training set size": len(active_set)}
        logs = {**eval_metrics, **active_logs}
        print(logs)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

# path to all the files that will be used for inference
path = f"./app/models/"
# self.model_path = self.path + "traced_bert_epoch_1.pt"
model_path = path + "custom_trained_model.bin"
# tokenizer_path = "./app/models/bert-large-portuguese-cased"
tokenizer_path = "neuralmind/bert-large-portuguese-cased"

# self.model = torch.jit.load(self.model_path)
model = BertForSequenceClassification.from_pretrained(
    tokenizer_path,
    num_labels=17,
    # local_files_only=True
)
model.load_state_dict(torch.load(model_path, map_location=device))

tokenizer = BertTokenizer.from_pretrained(
    tokenizer_path,
    do_lower_case=True,
    torchscript=True,
)

LABELS = {0: "A", 1: "B", 2: "C"}


class ClassificationProcessor:
    def __init__(self):
# Find GPU
device = torch.device("cuda")

# BERT constants:
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 16
pretrained_weights = 'bert-base-uncased'

# Set these paths to train BERT
MODEL_PATH = '../models/bert_mnli/bert_model_mnli.pt'
TRAIN_LOSS_PATH = '../models/bert_mnli/train_loss_per_batch.npy'
VAL_LOSS_PATH = '../models/bert_mnli/val_loss_per_epoch.npy'

# Initialize Model and Optimizer
model = BertForSequenceClassification.from_pretrained(pretrained_weights, num_labels=3)
model.cuda()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False)

# Load data
train_data = dl.SemEvalDataset("../data/preprocessed/bert_mnli_train.npy")
val_data = dl.SemEvalDataset("../data/preprocessed/bert_mnli_val.npy")
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, num_workers=0, shuffle=True)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, num_workers=0, shuffle=True)
        'recall': recall
    }


def send_inputs_to_device(inputs, device):
    return {key: tensor.to(device) for key, tensor in inputs.items()}


# Creating the Data Loaders
train_loader = torch.utils.data.DataLoader(dataset['train'], batch_size=16,
                                           collate_fn=DataCollatorWithPadding(tokenizer))
validation_loader = torch.utils.data.DataLoader(dataset['validation'], batch_size=32,
                                                collate_fn=DataCollatorWithPadding(tokenizer))
test_loader = torch.utils.data.DataLoader(dataset['test'], batch_size=32,
                                          collate_fn=DataCollatorWithPadding(tokenizer))

num_epochs = 4
num_warmup_steps = 5000

model = BertForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.train().to(device)

optimizer = AdamW(model.parameters(), lr=5e-6)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps,
                                            num_epochs * len(train_loader))


def predict(model, validation_loader, device):
    with torch.no_grad():
        model.eval()
        preds = []
        labels = []
        validation_losses = []
        for inputs in validation_loader:
def main():
    # training settings
    def get_args():
        parser = ArgumentParser(description='SST')
        parser.add_argument('--name', type=str, default='SST', metavar='S',
                            help="Model name")
        parser.add_argument('--checkpoint', type=str, default='bert-base-uncased', metavar='S',
                            help="e.g., bert-base-uncased, etc")
        parser.add_argument('--model', type=str, default='bert-base-uncased', metavar='S',
                            help="e.g., bert-base-uncased, etc")
        parser.add_argument('--batch-size', type=int, default=32, metavar='N',
                            help='input batch size for training (default: 32)')
        parser.add_argument('--epochs', type=int, default=1, metavar='N',
                            help='number of epochs to train (default: 1)')
        parser.add_argument('--lr', type=float, default=1e-5, metavar='LR',
                            help='learning rate (default: 1e-5)')
        parser.add_argument('--seed', type=int, default=1, metavar='S',
                            help='random seed (default: 1)')
        parser.add_argument('--num-workers', type=int, default=0, metavar='N',
                            help='number of CPU cores (default: 0)')
        parser.add_argument('--num-labels', type=int, default=2, metavar='N',
                            help='number of labels to classify (default: 2)')
        parser.add_argument('--l2', type=float, default=0.01, metavar='LR',
                            help='l2 regularization weight (default: 0.01)')
        parser.add_argument('--max-seq-length', type=int, default=66, metavar='N',
                            help='max sequence length for encoding (default: 66)')
        parser.add_argument('--warmup-proportion', type=float, default=0.1, metavar='N',
                            help='Warmup proportion (default: 0.1)')
        parser.add_argument('--embed-batch-size', type=int, default=1, metavar='N',
                            help='Embedding batch size emission; (default: 1)')
        args = parser.parse_args()
        return args

    args = get_args()

    # set seeds and determinism
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.cuda.amp.autocast(enabled=True)

    # set device
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # build ds
    train_ds = SST(type='train', transform=Tokenize_Transform(args, logger))

    # build ds
    dev_ds = SST(type='dev', transform=Tokenize_Transform(args, logger))

    # create training dataloader
    train_dataloader = DataLoader(train_ds,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  drop_last=False)

    # create embed dataloader
    train_embed_dataloader = DataLoader(train_ds,
                                        batch_size=args.embed_batch_size,
                                        shuffle=True,
                                        num_workers=args.num_workers,
                                        drop_last=False)

    # create embed dataloader
    dev_embed_dataloader = DataLoader(dev_ds,
                                      batch_size=args.embed_batch_size,
                                      shuffle=True,
                                      num_workers=args.num_workers,
                                      drop_last=False)

    # load the model
    model = BertForSequenceClassification.from_pretrained(
        args.checkpoint, num_labels=args.num_labels).to(device)

    # create gradient scaler for mixed precision
    scaler = GradScaler()

    # set optimizer
    param_optimizer = list(model.named_parameters())

    # exclude these from regularization
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    # give l2 regularization to any parameter that is not named after the no_decay list;
    # give no l2 regularization to any bias parameter or layernorm bias/weight
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': args.l2
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    # set optimizer
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.lr,
                      correct_bias=False,
                      weight_decay=args.l2)

    num_train_optimization_steps = int(len(train_ds) / args.batch_size) * args.epochs

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_training_steps=num_train_optimization_steps,
        num_warmup_steps=args.warmup_proportion * num_train_optimization_steps)

    # set epochs
    epochs = args.epochs

    # set location and make if necessary
    if args.checkpoint == 'bert-base-uncased':
        checkpoint_location = 'C:\\w266\\data\\embed_checkpoints\\'
    elif args.checkpoint == 'bert-large-uncased':
        checkpoint_location = 'C:\\w266\\data\\embed_checkpoints\\bert_large\\'
    os.makedirs(checkpoint_location, exist_ok=True)

    # train
    best_loss = np.inf
    for epoch in range(1, epochs + 1):
        train_log = train(model, train_dataloader, scaler, optimizer, scheduler, device, args)
        logs = dict(train_log)
        if logs['loss'] < best_loss:
            # torch save
            torch.save(model.state_dict(),
                       checkpoint_location + args.name + '_epoch_{}.pt'.format(epoch))
            best_loss = logs['loss']
        show_info = f'\nEpoch: {epoch} - ' + "-".join(
            [f' {key}: {value:.4f} ' for key, value in logs.items()])
        print(show_info)

    # now proceed to emit embeddings
    model = BertForSequenceClassification.from_pretrained(
        args.checkpoint,
        num_labels=args.num_labels,
        output_hidden_states=True).to(device)

    # load weights from 1 epoch
    model.load_state_dict(torch.load(checkpoint_location + args.name + '_epoch_1.pt'))

    # export embeddings
    emit_train_embeddings(train_embed_dataloader, train_ds, model, device, args)
    emit_dev_embeddings(dev_embed_dataloader, dev_ds, model, device, args)