def main(args):
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/train.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Set the seed value all over the place to make this reproducible.
    seed_val = args.seed
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Choose device
    device = get_default_device()

    with open(args.train_data_path) as f:
        train_data = json.load(f)

    electra_base = "google/electra-base-discriminator"
    electra_large = "google/electra-large-discriminator"
    tokenizer = ElectraTokenizer.from_pretrained(electra_large, do_lower_case=True)

    labels = []
    input_ids = []
    token_type_ids = []
    count = 0
    for item in train_data:
        context = item["context"]
        question = item["question"]
        lab = item["label"]
        if lab == 3:
            # Remove unanswerable examples at training time
            continue
        labels.append(lab)
        three_inp_ids = []
        three_tok_type_ids = []
        three_answer_options = item["answers"][:3]
        for i, ans in enumerate(three_answer_options):
            # Encode "context [SEP] question answer" as a single sequence per option.
            combo = context + " [SEP] " + question + " " + ans
            inp_ids = tokenizer.encode(combo)
            # Segment ids: 0 up to and including the first [SEP] (id 102 in the
            # ELECTRA/BERT vocabulary), 1 for the question+answer segment.
            tok_type_ids = [0 if pos <= inp_ids.index(102) else 1
                            for pos in range(len(inp_ids))]
            three_inp_ids.append(inp_ids)
            three_tok_type_ids.append(tok_type_ids)
        three_inp_ids = pad_sequences(three_inp_ids, maxlen=MAXLEN, dtype="long",
                                      value=0, truncating="post", padding="post")
        three_tok_type_ids = pad_sequences(three_tok_type_ids, maxlen=MAXLEN, dtype="long",
                                           value=0, truncating="post", padding="post")
        input_ids.append(three_inp_ids)
        token_type_ids.append(three_tok_type_ids)

    # Create attention masks
    attention_masks = []
    for sen in input_ids:
        sen_attention_masks = []
        for opt in sen:
            att_mask = [int(token_id > 0) for token_id in opt]
            sen_attention_masks.append(att_mask)
        attention_masks.append(sen_attention_masks)

    # Convert to torch tensors
    labels = torch.tensor(labels)
    labels = labels.long().to(device)
    input_ids = torch.tensor(input_ids)
    input_ids = input_ids.long().to(device)
    token_type_ids = torch.tensor(token_type_ids)
    token_type_ids = token_type_ids.long().to(device)
    attention_masks = torch.tensor(attention_masks)
    attention_masks = attention_masks.long().to(device)

    # Create the DataLoader for the training set.
    train_data = TensorDataset(input_ids, token_type_ids, attention_masks, labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=args.batch_size)

    model = ElectraForMultipleChoice.from_pretrained(electra_large).to(device)

    optimizer = AdamW(model.parameters(),
                      lr=args.learning_rate,
                      eps=args.adam_epsilon
                      # weight_decay = 0.01
                      )

    loss_values = []

    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * args.n_epochs
    # Create the learning rate scheduler with 10% linear warmup.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(0.1 * total_steps),
                                                num_training_steps=total_steps)

    for epoch in range(args.n_epochs):
        # Perform one full pass over the training set.
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, args.n_epochs))
        print('Training...')
        # Measure how long the training epoch takes.
        t0 = time.time()
        # Reset the total loss for this epoch.
        total_loss = 0
        model.train()
        model.zero_grad()
        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
                # Report progress.
                print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed))
            b_input_ids = batch[0].to(device)
            b_tok_typ_ids = batch[1].to(device)
            b_att_msks = batch[2].to(device)
            b_labs = batch[3].to(device)
            model.zero_grad()
            outputs = model(input_ids=b_input_ids,
                            attention_mask=b_att_msks,
                            token_type_ids=b_tok_typ_ids,
                            labels=b_labs)
            loss = outputs[0]
            total_loss += loss.item()
            print(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Update the learning rate.
            scheduler.step()
            # model.zero_grad()

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)
        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)

        print("")
        print(" Average training loss: {0:.2f}".format(avg_train_loss))
        print(" Training epoch took: {:}".format(format_time(time.time() - t0)))

    # Save the full model object to a file
    file_path = args.save_path + 'electra_QA_MC_seed' + str(args.seed) + '.pt'
    torch.save(model, file_path)
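

# ---------------------------------------------------------------------------
# Hypothetical CLI sketch (not part of the original script). main() reads
# seed, train_data_path, batch_size, learning_rate, adam_epsilon, n_epochs
# and save_path from `args`; the parser below is only an assumption about
# what a matching argparse setup might look like, with made-up defaults.
# ---------------------------------------------------------------------------
def build_arg_parser_sketch():
    import argparse
    parser = argparse.ArgumentParser(description='Train an ELECTRA multiple-choice QA model')
    parser.add_argument('--seed', type=int, default=1, help='random seed (assumed default)')
    parser.add_argument('--train_data_path', type=str, required=True, help='path to the training JSON')
    parser.add_argument('--batch_size', type=int, default=4, help='assumed default')
    parser.add_argument('--learning_rate', type=float, default=2e-6, help='assumed default')
    parser.add_argument('--adam_epsilon', type=float, default=1e-8, help='assumed default')
    parser.add_argument('--n_epochs', type=int, default=2, help='assumed default')
    parser.add_argument('--save_path', type=str, required=True, help='directory/prefix for the saved model')
    return parser

# Example (hypothetical) invocation:
#   main(build_arg_parser_sketch().parse_args())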
def main(args):
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/train.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Set the seed value all over the place to make this reproducible.
    seed_val = args.seed
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Choose device
    device = get_default_device()

    # Load the middle-school and high-school portions of the training data
    # and concatenate them.
    with open(args.train_data_path + "middle.json") as f:
        middle_data = json.load(f)
    with open(args.train_data_path + "high.json") as f:
        high_data = json.load(f)
    train_data = middle_data + high_data

    electra_base = "google/electra-base-discriminator"
    electra_large = "google/electra-large-discriminator"
    tokenizer = ElectraTokenizer.from_pretrained(electra_large, do_lower_case=True)

    def asNum(x):
        # Map the letter answer key to a class index.
        if x == "A":
            return 0
        if x == "B":
            return 1
        if x == "C":
            return 2
        if x == "D":
            return 3

    labels = []
    input_ids = []
    token_type_ids = []
    count = 0
    for item in train_data:
        context = item["article"]
        questions = item["questions"]
        answers = item["answers"]
        options = item["options"]
        for qu_num in range(len(questions)):
            lab = asNum(answers[qu_num])
            labels.append(lab)
            four_inp_ids = []
            four_tok_type_ids = []
            question = questions[qu_num]
            opts = options[qu_num]
            for opt in opts:
                # Encode "article [SEP] question option" as a single sequence per option.
                combo = context + " [SEP] " + question + " " + opt
                inp_ids = tokenizer.encode(combo)
                if len(inp_ids) > 512:
                    # Keep the last 512 tokens so the question and option survive
                    # truncation of a long article.
                    inp_ids = inp_ids[-512:]
                # Segment ids: 0 up to and including the first [SEP] (id 102 in the
                # ELECTRA/BERT vocabulary), 1 for the question+option segment.
                tok_type_ids = [0 if pos <= inp_ids.index(102) else 1
                                for pos in range(len(inp_ids))]
                four_inp_ids.append(inp_ids)
                four_tok_type_ids.append(tok_type_ids)
            four_inp_ids = pad_sequences(four_inp_ids, maxlen=MAXLEN, dtype="long",
                                         value=0, truncating="post", padding="post")
            four_tok_type_ids = pad_sequences(four_tok_type_ids, maxlen=MAXLEN, dtype="long",
                                              value=0, truncating="post", padding="post")
            input_ids.append(four_inp_ids)
            token_type_ids.append(four_tok_type_ids)

    # Create attention masks
    attention_masks = []
    for sen in input_ids:
        sen_attention_masks = []
        for opt in sen:
            att_mask = [int(token_id > 0) for token_id in opt]
            sen_attention_masks.append(att_mask)
        attention_masks.append(sen_attention_masks)

    # Convert to torch tensors
    labels = torch.tensor(labels)
    labels = labels.long().to(device)
    input_ids = torch.tensor(input_ids)
    input_ids = input_ids.long().to(device)
    token_type_ids = torch.tensor(token_type_ids)
    token_type_ids = token_type_ids.long().to(device)
    attention_masks = torch.tensor(attention_masks)
    attention_masks = attention_masks.long().to(device)

    # Create the DataLoader for the training set.
    train_data = TensorDataset(input_ids, token_type_ids, attention_masks, labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=args.batch_size)

    model = ElectraForMultipleChoice.from_pretrained(electra_large).to(device)

    optimizer = AdamW(model.parameters(),
                      lr=args.learning_rate,
                      eps=args.adam_epsilon
                      # weight_decay = 0.01
                      )

    loss_values = []

    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * args.n_epochs
    # Create the learning rate scheduler with 10% linear warmup.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(0.1 * total_steps),
                                                num_training_steps=total_steps)

    for epoch in range(args.n_epochs):
        # Perform one full pass over the training set.
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, args.n_epochs))
        print('Training...')
        # Measure how long the training epoch takes.
        t0 = time.time()
        # Reset the total loss for this epoch.
        total_loss = 0
        model.train()
        model.zero_grad()
        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
                # Report progress.
                print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed))
            b_input_ids = batch[0].to(device)
            b_tok_typ_ids = batch[1].to(device)
            b_att_msks = batch[2].to(device)
            b_labs = batch[3].to(device)
            model.zero_grad()
            outputs = model(input_ids=b_input_ids,
                            attention_mask=b_att_msks,
                            token_type_ids=b_tok_typ_ids,
                            labels=b_labs)
            loss = outputs[0]
            total_loss += loss.item()
            print(loss.item())
            optimizer.zero_grad()
            loss.backward()
            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # if (step+1) % accumulation_steps == 0:
            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()
            # Update the learning rate.
            scheduler.step()
            # model.zero_grad()

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)
        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)

        print("")
        print(" Average training loss: {0:.2f}".format(avg_train_loss))
        print(" Training epoch took: {:}".format(format_time(time.time() - t0)))

    # Save the full model object to a file
    file_path = args.save_path + 'electra_QA_MC_seed' + str(args.seed) + '.pt'
    torch.save(model, file_path)
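

# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original script). Because the
# training code saves the whole model object with torch.save(model, path),
# it can be reloaded directly with torch.load for evaluation; the file-name
# convention below simply mirrors the one used above.
# ---------------------------------------------------------------------------
def load_trained_model_sketch(save_path, seed, device):
    file_path = save_path + 'electra_QA_MC_seed' + str(seed) + '.pt'
    # map_location lets a GPU-trained checkpoint be loaded onto any device.
    model = torch.load(file_path, map_location=device)
    model.eval()
    return model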