def train_epoch(model, training_data, optimizer, device, epoch, tb=None, log_interval=100):
    model.train()

    total_loss = 0
    n_char_total = 0
    n_char_correct = 0

    for batch_idx, batch in enumerate(
            tqdm(training_data, mininterval=2, leave=False)):
        batch_qs, batch_qs_pos, batch_as, batch_as_pos = map(
            lambda x: x.to(device), batch)
        gold_as = batch_as[:, 1:]

        optimizer.zero_grad()
        pred_as = model(batch_qs, batch_qs_pos, batch_as, batch_as_pos)

        loss, n_correct = compute_performance(pred_as, gold_as, smoothing=True)
        loss.backward()

        # update parameters
        optimizer.step()

        # note keeping
        total_loss += loss.item()

        non_pad_mask = gold_as.ne(Constants.PAD)
        n_char = non_pad_mask.sum().item()
        n_char_total += n_char
        n_char_correct += n_correct

        if tb is not None and batch_idx % log_interval == 0:
            tb.add_scalars(
                {
                    "loss_per_char": total_loss / n_char_total,
                    "accuracy": n_char_correct / n_char_total,
                },
                group="train",
                sub_group="batch",
                global_step=epoch * len(training_data) + batch_idx)

    loss_per_char = total_loss / n_char_total
    accuracy = n_char_correct / n_char_total

    if tb is not None:
        tb.add_scalars(
            {
                "loss_per_char": loss_per_char,
                "accuracy": accuracy,
            },
            group="train",
            sub_group="epoch",
            global_step=epoch)

    return loss_per_char, accuracy
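
# Hedged sketch, not the project's actual helper: the loops above rely on
# `compute_performance(pred, gold, smoothing)` returning a summed loss tensor
# and the count of correctly predicted non-pad characters. One plausible
# implementation is sketched below, assuming `pred` holds flattened
# (batch * seq_len, vocab) logits, `gold` holds target token indices padded
# with Constants.PAD, and a smoothing mass of eps=0.1. Field shapes and the
# smoothing constant are assumptions, not taken from the original code.
import torch
import torch.nn.functional as F


def compute_performance_sketch(pred, gold, smoothing=False, eps=0.1):
    gold = gold.contiguous().view(-1)
    non_pad_mask = gold.ne(Constants.PAD)

    if smoothing:
        # Label smoothing: keep (1 - eps) on the gold class and spread eps
        # uniformly over the remaining classes before the log-softmax.
        n_class = pred.size(1)
        one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1)
        one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
        log_prob = F.log_softmax(pred, dim=1)
        loss = -(one_hot * log_prob).sum(dim=1)
        loss = loss.masked_select(non_pad_mask).sum()
    else:
        # Plain summed cross-entropy, ignoring padding positions.
        loss = F.cross_entropy(pred, gold,
                               ignore_index=Constants.PAD,
                               reduction="sum")

    pred_tokens = pred.max(1)[1]
    n_correct = pred_tokens.eq(gold).masked_select(non_pad_mask).sum().item()
    return loss, n_correct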
def eval_epoch(model, validation_data, device, graph_pool, epoch, tb=None, log_interval=100):
    model.eval()

    total_loss = 0
    n_char_total = 0
    n_char_correct = 0

    with torch.no_grad():
        for batch_idx, batch in enumerate(
                tqdm(validation_data, mininterval=2, leave=False)):
            # prepare data
            # batch_qs, batch_as = batch
            # gold_as = torch.tensor(batch_as[:, 1:]).to(device)
            # g = graph_pool(batch_qs, batch_as, device=device)
            gold_as, g = batch
            gold_as = gold_as.to(device)
            g = graph_to_device(g, device)

            # forward
            pred_as = model(g)
            loss, n_correct = compute_performance(pred_as, gold_as, smoothing=False)

            # note keeping
            total_loss += loss.item()

            non_pad_mask = gold_as.ne(Constants.PAD)
            n_char = non_pad_mask.sum().item()
            n_char_total += n_char
            n_char_correct += n_correct

    loss_per_char = total_loss / n_char_total
    accuracy = n_char_correct / n_char_total

    if tb is not None:
        tb.add_scalars(
            {
                "loss_per_char": loss_per_char,
                "accuracy": accuracy,
            },
            group="eval",
            sub_group="epoch",
            global_step=epoch)

    return loss_per_char, accuracy
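
# Hedged sketch, not the project's actual helper: `eval_epoch` moves each
# batched graph to the device via `graph_to_device(g, device)`. Assuming a
# DGL-style graph object (an assumption; the real graph library may differ),
# a minimal version could look like this.
def graph_to_device_sketch(g, device):
    # Recent DGL graphs can be moved wholesale; otherwise move the node and
    # edge feature tensors one by one.
    if hasattr(g, "to"):
        return g.to(device)
    for key in g.ndata:
        g.ndata[key] = g.ndata[key].to(device)
    for key in g.edata:
        g.edata[key] = g.edata[key].to(device)
    return g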
def inference_epoch(model, data, device, epoch, group, tb=None, log_interval=100):
    model.eval()

    total_loss = 0
    n_char_total = 0
    n_char_correct = 0

    with torch.no_grad():
        for batch_idx, batch in enumerate(
                tqdm(data, mininterval=2, leave=False)):
            # prepare data
            batch_qs, batch_qs_pos, batch_as, batch_as_pos = map(
                lambda x: x.to(device), batch)
            gold_as = batch_as[:, 1:]

            # forward
            pred_as = model(batch_qs, batch_qs_pos, batch_as, batch_as_pos)
            loss, n_correct = compute_performance(pred_as, gold_as, smoothing=False)

            # note keeping
            total_loss += loss.item()

            non_pad_mask = gold_as.ne(Constants.PAD)
            n_char = non_pad_mask.sum().item()
            n_char_total += n_char
            n_char_correct += n_correct

    loss_per_char = total_loss / n_char_total
    accuracy = n_char_correct / n_char_total

    if tb is not None:
        tb.add_scalars(
            {
                "loss_per_char": loss_per_char,
                "accuracy": accuracy,
            },
            group=group,
            sub_group="epoch",
            global_step=epoch,
        )

    return loss_per_char, accuracy
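
# Hedged usage sketch: this wrapper and the loader names (interpolate_loader,
# extrapolate_loader) are illustrative, not part of the original code. It
# shows why `inference_epoch` takes an explicit `group` argument: the same
# routine can log separate TensorBoard curves for different held-out splits
# of the same trained model.
def example_inference_run(model, interpolate_loader, extrapolate_loader,
                          device, epoch, tb=None):
    interp = inference_epoch(model, interpolate_loader, device, epoch,
                             group="interpolate", tb=tb)
    extrap = inference_epoch(model, extrapolate_loader, device, epoch,
                             group="extrapolate", tb=tb)
    return interp, extrap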
def train_epoch(
    model,
    name,
    training_data,
    optimizer,
    device,
    epoch,
    tb=None,
    log_interval=100,
    max_batches=None,
    run_batch_count=0,
    start_batch=0,
    total_loss=0,
    n_char_total=0,
    n_char_correct=0,
    lr=None,
    warmup_lr=None,
    warmup_interval=None,
    smoothing=False,
):
    training_iter = iter(training_data)
    if start_batch > 0:
        last_question = np_encode_string(
            training_data.dataset.__getitem__(-1)["q"])
        print(f"Final question before checkpoint was {last_question}")

    model.train()

    # interrupted_batch = None
    done = False
    loss_per_char = 0
    accuracy = 0

    for batch_idx, batch in enumerate(training_iter, start=start_batch):
        if utils.is_preempted():
            print("Exiting...")
            sys.exit(0)

        if warmup_interval is not None and batch_idx == warmup_interval:
            print(
                f"End of warmup. Swapping learning rates from {warmup_lr} to {lr}"
            )
            warmup_lr = lr
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr

        batch_qs, batch_qs_pos, batch_as, batch_as_pos = map(
            lambda x: x.to(device), batch)
        gold_as = batch_as[:, 1:]

        optimizer.zero_grad()
        pred_as = model(batch_qs, batch_qs_pos, batch_as, batch_as_pos)

        loss, n_correct = compute_performance(pred_as, gold_as, smoothing=smoothing)
        loss.backward()

        # Clip gradients, paper uses 0.1
        clip_grad_value_(model.parameters(), 0.1)

        # update parameters
        optimizer.step()

        # note keeping
        total_loss += loss.item()

        non_pad_mask = gold_as.ne(Constants.PAD)
        n_char = non_pad_mask.sum().item()
        n_char_total += n_char
        # guard against division by zero for the per-batch statistics
        n_char = n_char if n_char > 1 else 1

        batch_loss = loss.item() / n_char
        loss_per_char = total_loss / n_char_total

        n_char_correct += n_correct
        batch_acc = n_correct / n_char
        accuracy = n_char_correct / n_char_total

        print(
            f"Batch: {batch_idx}. Acc: {accuracy:.6f}. Loss: {loss_per_char:.6f}. "
            f"Batch_acc: {batch_acc:.6f}. Batch_loss: {batch_loss:.6f}"
        )

        # TODO: automatically trim the TB logs that go beyond the preempted checkpoint
        if tb is not None and batch_idx % log_interval == 0:
            tb.add_scalars(
                {
                    "loss_per_char": loss_per_char,
                    "accuracy": accuracy,
                    "batch_loss": batch_loss,
                    "batch_acc": batch_acc,
                },
                group="train",
                sub_group="batch",
                global_step=run_batch_count,
            )

        run_batch_count += 1

        if max_batches is not None and run_batch_count == max_batches:
            print(
                f"Reached {run_batch_count} batches on max_batches of {max_batches}. Breaking from epoch."
            )
            # interrupted_batch = batch_idx
            done = True
            break

        if batch_idx % 251 == 0 and batch_idx != 0:
            print(
                f"Checkpointing on batch: {batch_idx}. Accuracy: {accuracy}. "
                f"Loss per char: {loss_per_char}. Time: {time.time()}"
            )
            print(f"Last question is {batch_qs[-1]}")
            state = build_checkpoint(
                name=name,
                model=model,
                optimizer=optimizer,
                acc=accuracy,
                loss=loss_per_char,
                epoch=epoch,
                run_batches=run_batch_count,
                start_batch=batch_idx + 1,
                total_loss=total_loss,
                n_char_total=n_char_total,
                n_char_correct=n_char_correct,
                lr=warmup_lr,
            )
            save_checkpoint(state=state,
                            name=f"{name}_latest_checkpoint",
                            path="./checkpoints")

        # if utils.is_preempted():
        #     print(
        #         f"Preemption at end of Epoch batch: {batch_idx} and new Run batch: {run_batch_count}. Breaking from epoch."
        #     )
        #     interrupted_batch = batch_idx
        #     break

    if tb is not None and not utils.is_preempted():
        tb.add_scalars(
            {
                "loss_per_char": loss_per_char,
                "accuracy": accuracy,
            },
            group="train",
            sub_group="epoch",
            global_step=epoch,
        )

    return loss_per_char, accuracy, run_batch_count, done
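
# Hedged resume sketch, not part of the original code: the extra state
# arguments on this `train_epoch` variant (run_batch_count, start_batch,
# total_loss, n_char_total, n_char_correct, warmup_lr) exist so a preempted
# run can continue mid-epoch from the latest checkpoint. The dictionary keys
# below mirror the keyword arguments passed to `build_checkpoint` above, but
# the actual on-disk layout of the checkpoint is an assumption.
def resume_from_checkpoint_sketch(model, name, training_data, optimizer,
                                  device, checkpoint, tb=None, lr=None,
                                  warmup_interval=None):
    model.load_state_dict(checkpoint["model"])          # assumed field name
    optimizer.load_state_dict(checkpoint["optimizer"])  # assumed field name
    return train_epoch(
        model,
        name,
        training_data,
        optimizer,
        device,
        epoch=checkpoint["epoch"],
        tb=tb,
        run_batch_count=checkpoint["run_batches"],
        start_batch=checkpoint["start_batch"],
        total_loss=checkpoint["total_loss"],
        n_char_total=checkpoint["n_char_total"],
        n_char_correct=checkpoint["n_char_correct"],
        lr=lr,
        warmup_lr=checkpoint["lr"],
        warmup_interval=warmup_interval,
    )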