import time
from random import shuffle

import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# Project helpers assumed to be defined elsewhere in the repo:
# prepare_batches, batch_to_tensors, TFSchedule, smooth_cross_entropy,
# Accuracy, get_target_tensor


def train(model, training_data, validation_data, epochs, batch_size,
          batches_per_print=100, evaluate_per=1, padding_index=-100,
          checkpoint_path=None, custom_schedule=False, custom_loss=False):
    """
    Training loop with TensorBoard logging and optional checkpointing.

    model: MusicTransformer module
    training_data: List of encoded music sequences
    validation_data: List of encoded music sequences
    epochs: Number of iterations over training batches
    batch_size: Number of sequences per batch
    batches_per_print: How often to print training loss
    evaluate_per: Calculate validation loss after this many epochs
    padding_index: Ignore this sequence token in loss calculation
    checkpoint_path: (str or None) If defined, save the model's state dict to
        this file path after validation
    custom_schedule: (bool) If True, use a learning rate scheduler with a
        warmup ramp
    custom_loss: (bool) If True, set loss function as Cross Entropy with label
        smoothing
    """
    writer = SummaryWriter('drive/MyDrive/UETK62/runs/transformers1')
    training_start_time = time.time()
    model.train()
    optimizer = torch.optim.Adam(model.parameters())
    if custom_schedule:
        optimizer = TFSchedule(optimizer, model.d_model)
    if custom_loss:
        loss_function = smooth_cross_entropy
    else:
        loss_function = nn.CrossEntropyLoss(ignore_index=padding_index)
    accuracy = Accuracy()

    if torch.cuda.is_available():
        model.cuda()
        print("GPU is available")
    else:
        print("GPU not available, CPU used")

    training_losses = []
    validation_losses = []

    # pad to length of longest sequence
    # minus one because input/target sequences are shifted by one token
    max_length = max(len(L) for L in (training_data + validation_data)) - 1

    # test that a checkpoint can actually be written before training starts
    print("test save ckpt")
    if checkpoint_path is not None:
        print("checkpoint: " + checkpoint_path)
        try:
            torch.save(model.state_dict(), checkpoint_path + "_e")
            print("Test Checkpoint saved!")
        except Exception as exc:
            print("Error: test checkpoint could not be saved...", exc)

    # running batch counter, used as the TensorBoard global step
    global_step = 0

    for e in tqdm(range(epochs)):
        batch_start_time = time.time()
        batch_num = 1
        averaged_loss = 0
        averaged_accuracy = 0
        # returns batches of a given size
        training_batches = prepare_batches(training_data, batch_size)

        for batch in training_batches:
            # skip batches that are undersized
            if len(batch[0]) != batch_size:
                continue
            x, y, x_mask = batch_to_tensors(batch, model.n_tokens, max_length)
            y_hat = model(x, x_mask).transpose(1, 2)
            loss = loss_function(y_hat, y)

            model.zero_grad()
            loss.backward()
            optimizer.step()

            training_loss = loss.item()
            training_losses.append(training_loss)
            writer.add_scalar("loss/train_loss", training_loss, global_step)

            averaged_loss += training_loss
            averaged_accuracy += accuracy(y_hat, y, x_mask)
            if batch_num % batches_per_print == 0:
                print(f"batch {batch_num}, loss: {averaged_loss / batches_per_print:.2f}")
                print(f"accuracy: {averaged_accuracy / batches_per_print:.2f}")
                writer.add_scalar("accuracy/train_accuracy",
                                  averaged_accuracy / batches_per_print,
                                  global_step)
                averaged_loss = 0
                averaged_accuracy = 0
            batch_num += 1
            global_step += 1

        print(f"epoch: {e + 1}/{epochs} | time: {(time.time() - batch_start_time) / 60:,.0f}m")
        shuffle(training_data)

        if (e + 1) % evaluate_per == 0:
            # switch to eval mode (disables dropout) for evaluation
            model.eval()
            validation_batches = prepare_batches(validation_data, batch_size)

            # get loss per batch
            val_loss = 0
            n_batches = 0
            val_accuracy = 0
            for batch in validation_batches:
                if len(batch[0]) != batch_size:
                    continue
                x, y, x_mask = batch_to_tensors(batch, model.n_tokens, max_length)
                y_hat = model(x, x_mask).transpose(1, 2)
                loss = loss_function(y_hat, y)
                val_loss += loss.item()
                val_accuracy += accuracy(y_hat, y, x_mask)
                n_batches += 1

            if checkpoint_path is not None:
                print("checkpoint: " + checkpoint_path)
                try:
                    torch.save(model.state_dict(), checkpoint_path + f"_e{e}")
                    print("Checkpoint saved!")
                except Exception as exc:
                    print("Error: checkpoint could not be saved...", exc)

            model.train()

            # average out validation loss and accuracy
            val_accuracy = val_accuracy / n_batches
            val_loss = val_loss / n_batches
            validation_losses.append(val_loss)
            writer.add_scalar("loss/val_loss", val_loss, e)
            writer.add_scalar("accuracy/val_accuracy", val_accuracy, e)
            print(f"validation loss: {val_loss:.2f}")
            print(f"validation accuracy: {val_accuracy:.2f}")
            shuffle(validation_data)

    return training_losses
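# When custom_loss=True, the loop above swaps in `smooth_cross_entropy`. A minimal
# sketch of a label-smoothed cross entropy with that call signature, assuming logits
# of shape (batch, n_tokens, seq_len) and targets of shape (batch, seq_len) as at the
# call sites above. This is an illustrative stand-in, not necessarily the project's
# implementation; recent PyTorch versions also accept
# nn.CrossEntropyLoss(label_smoothing=...) directly.
import torch.nn.functional as F


def label_smoothed_cross_entropy(logits, targets, smoothing=0.1, padding_index=-100):
    log_probs = F.log_softmax(logits, dim=1)          # (batch, n_tokens, seq_len)
    mask = targets.ne(padding_index)                  # valid (non-padding) positions
    safe_targets = targets.clamp(min=0)               # keep gather indices in range
    # negative log likelihood of the true token at each position
    nll = -log_probs.gather(1, safe_targets.unsqueeze(1)).squeeze(1)
    # loss against a uniform distribution over the vocabulary
    uniform = -log_probs.mean(dim=1)
    loss = (1.0 - smoothing) * nll + smoothing * uniform
    return loss[mask].mean()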
def train(model, training_data, validation_data, epochs, batch_size,
          batches_per_print=100, evaluate_per=1, padding_index=-100,
          checkpoint_path=None, custom_schedule=False, custom_loss=False):
    """
    Training loop function.

    Args:
        model: MusicTransformer module
        training_data: List of encoded music sequences
        validation_data: List of encoded music sequences
        epochs: Number of iterations over training batches
        batch_size: Number of sequences per batch
        batches_per_print: How often to print training loss
        evaluate_per: Calculate validation loss after this many epochs
        padding_index: Ignore this sequence token in loss calculation
        checkpoint_path: (str or None) If defined, save the model's state dict
            to this file path after validation
        custom_schedule: (bool) If True, use a learning rate scheduler with a
            warmup ramp
        custom_loss: (bool) If True, set loss function as Cross Entropy with
            label smoothing
    """
    training_start_time = time.time()
    model.train()
    optimizer = torch.optim.Adam(model.parameters())
    if custom_schedule:
        optimizer = TFSchedule(optimizer, model.d_model)
    if custom_loss:
        loss_function = smooth_cross_entropy
    else:
        loss_function = nn.CrossEntropyLoss(ignore_index=padding_index)
    accuracy = Accuracy()

    if torch.cuda.is_available():
        model.cuda()
        print("GPU is available")
    else:
        print("GPU not available, CPU used")

    training_losses = []
    validation_losses = []

    # pad to length of longest sequence
    # minus one because input/target sequences are shifted by one token
    max_length = max(len(L) for L in (training_data + validation_data)) - 1

    for e in range(epochs):
        batch_start_time = time.time()
        batch_num = 1
        averaged_loss = 0
        averaged_accuracy = 0
        # returns batches of a given size
        training_batches = prepare_batches(training_data, batch_size)

        for batch in training_batches:
            # skip batches that are undersized
            if len(batch[0]) != batch_size:
                continue
            x, y, x_mask = batch_to_tensors(batch, model.n_tokens, max_length)
            # shape: (batch_size, n_tokens, seq_length)
            y_hat = model(x, x_mask).transpose(1, 2)
            loss = loss_function(y_hat, y)

            # clear old gradients from previous step
            model.zero_grad()
            # compute derivative of loss w/r/t parameters
            loss.backward()
            # optimizer takes a step based on gradient
            optimizer.step()

            training_loss = loss.item()
            training_losses.append(training_loss)

            # running averages over the last `batches_per_print` batches
            averaged_loss += training_loss
            averaged_accuracy += accuracy(y_hat, y, x_mask)
            if batch_num % batches_per_print == 0:
                print(f"batch {batch_num}, loss: {averaged_loss / batches_per_print:.2f}")
                print(f"accuracy: {averaged_accuracy / batches_per_print:.2f}")
                averaged_loss = 0
                averaged_accuracy = 0
            batch_num += 1

        print(f"epoch: {e + 1}/{epochs} | time: {(time.time() - batch_start_time) / 60:,.0f}m")
        shuffle(training_data)

        if (e + 1) % evaluate_per == 0:
            # switch to eval mode (disables dropout) for evaluation
            model.eval()
            validation_batches = prepare_batches(validation_data, batch_size)

            # get loss per batch
            val_loss = 0
            n_batches = 0
            val_accuracy = 0
            for batch in validation_batches:
                if len(batch[0]) != batch_size:
                    continue
                x, y, x_mask = batch_to_tensors(batch, model.n_tokens, max_length)
                y_hat = model(x, x_mask).transpose(1, 2)
                loss = loss_function(y_hat, y)
                val_loss += loss.item()
                val_accuracy += accuracy(y_hat, y, x_mask)
                n_batches += 1

            if checkpoint_path is not None:
                try:
                    torch.save(model.state_dict(), checkpoint_path + f"_e{e}")
                    print("Checkpoint saved!")
                except Exception as exc:
                    print("Error: checkpoint could not be saved...", exc)

            model.train()

            # average out validation loss and accuracy
            val_accuracy = val_accuracy / n_batches
            val_loss = val_loss / n_batches
            validation_losses.append(val_loss)
            print(f"validation loss: {val_loss:.2f}")
            print(f"validation accuracy: {val_accuracy:.2f}")
            shuffle(validation_data)

    return training_losses
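# When custom_schedule=True, the loops above wrap the optimizer in TFSchedule and
# then call optimizer.step() on the wrapper. A minimal sketch of the warmup ramp
# this suggests, following the schedule from "Attention Is All You Need":
# lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5). This is an assumption
# about TFSchedule's behavior, not its actual implementation; warmup_steps=4000 is an
# illustrative default.
class WarmupSchedule:
    def __init__(self, optimizer, d_model, warmup_steps=4000):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.step_num = 0

    def step(self):
        # update the learning rate on every step, then step the wrapped optimizer
        self.step_num += 1
        lr = self.d_model ** -0.5 * min(self.step_num ** -0.5,
                                        self.step_num * self.warmup_steps ** -1.5)
        for group in self.optimizer.param_groups:
            group["lr"] = lr
        self.optimizer.step()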
def train(model, training_data, validation_data, epochs, lr, evaluate_per, batch_size):
    # put the model in training mode (enables dropout, etc.)
    model.train()
    # Adam: an adaptive variant of stochastic gradient descent
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # this is equivalent to a log softmax activation layer + negative log likelihood
    loss_function = nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        # device = torch.device("cuda")
        model.cuda()
        print("GPU is available")
    else:
        # device = torch.device("cpu")
        print("GPU not available, CPU used")

    for e in range(epochs):
        start_time = time.time()
        # returns batches of a given size:
        # input and target sequences, the latter one time step ahead
        training_batches = prepare_batches(training_data, batch_size)

        # hx is the hidden state, the "internal representation" of the sequence;
        # None means it initializes to zeros
        hx = None

        for input_sequences, target_sequences in training_batches:
            # skip batches that are undersized
            if len(input_sequences) != batch_size:
                continue
            y_hat, hx = model(input_sequences, hx)
            y = get_target_tensor(target_sequences)
            loss = loss_function(y_hat.flatten(0, 1), y)

            # don't want to be backpropagating through every timestep, so the
            # hidden state is detached from the graph
            hx = tuple(h.detach() for h in hx)

            # clear old gradients from previous step
            model.zero_grad()
            # compute derivative of loss w/r/t parameters
            loss.backward()
            # consider clipping grad norm (see the sketch below)
            # optimizer takes a step based on gradient
            optimizer.step()
            training_loss = loss.item()

        print(f"epoch: {e + 1}/{epochs} | time: {time.time() - start_time:.0f}s")
        print(f"training loss: {training_loss:.2f}")
        shuffle(training_data)

        if (e + 1) % evaluate_per == 0:
            # switch to eval mode and disable gradient tracking for evaluation
            model.eval()
            validation_batches = prepare_batches(validation_data, batch_size)

            # get loss per batch
            val_loss = 0
            n_batches = 0
            with torch.no_grad():
                for input_sequences, target_sequences in validation_batches:
                    if len(input_sequences) != batch_size:
                        continue
                    sequence_lengths = [len(sequence) for sequence in input_sequences]
                    y_hat, hx = model(input_sequences, hx)
                    y = get_target_tensor(target_sequences)
                    loss = loss_function(y_hat.flatten(0, 1), y)
                    val_loss += loss.item()
                    n_batches += 1
            model.train()
            print(f"validation loss: {val_loss / n_batches:.2f}")
            shuffle(validation_data)
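# The recurrent loop above notes "consider clipping grad norm". A minimal sketch of
# how that could slot in between loss.backward() and optimizer.step(); the helper
# name and max_norm=1.0 are illustrative choices, not part of the original code.
def clipped_step(model, optimizer, loss, max_norm=1.0):
    # clear old gradients, backpropagate, clip the global gradient norm, then step
    model.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    optimizer.step()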