def evaluate(
    model: TEDD1104,
    X: torch.Tensor,
    golds: torch.Tensor,
    device: torch.device,
    batch_size: int,
) -> float:
    """
    Given a set of input examples and the golds for these examples, evaluates the model accuracy

    Input:
    - model: TEDD1104 model to evaluate
    - X: input examples [num_examples, sequence_size, 3, H, W]
    - golds: golds for the input examples [num_examples]
    - device: torch.device, cuda or cpu
    - batch_size: integer batch size

    Output:
    - Accuracy: float
    """
    model.eval()
    correct = 0
    for X_batch, y_batch in nn_batchs(X, golds, batch_size):
        predictions: np.ndarray = model.predict(X_batch.to(device)).cpu().numpy()
        correct += np.sum(predictions == y_batch)

    return correct / len(golds)
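
# --- Illustrative helper sketch ---------------------------------------------
# evaluate() above (and the file-based train() below) iterate over mini-batches
# produced by nn_batchs, a helper defined elsewhere in this repository. The
# function below is NOT that helper, only a minimal sketch of what it is
# assumed to do here: yield consecutive (X, y) slices of size batch_size.
def _nn_batchs_sketch(X, golds, batch_size: int):
    """Minimal stand-in for nn_batchs: yield consecutive (X_batch, y_batch) slices."""
    for start in range(0, len(golds), batch_size):
        yield X[start : start + batch_size], golds[start : start + batch_size]
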
def evaluate(
    model: TEDD1104,
    data_loader: DataLoader,
    device: torch.device,
    fp16: bool,
) -> float:
    """
    Given a data loader with input examples and their golds, evaluates the model accuracy

    Input:
    - model: TEDD1104 model to evaluate
    - data_loader: torch.utils.data.DataLoader with the examples to evaluate
    - device: torch.device, cuda or cpu
    - fp16: evaluate the model using FP16 (mixed precision) inference

    Output:
    - Accuracy: float
    """
    model.eval()
    correct = 0
    total = 0
    for batch in tqdm(data_loader, desc="Evaluating model"):
        # Stack the 5 images of each sequence and flatten them into a single
        # batch dimension: [batch_size * 5, 3, H, W]
        x = torch.flatten(
            torch.stack(
                (
                    batch["image1"],
                    batch["image2"],
                    batch["image3"],
                    batch["image4"],
                    batch["image5"],
                ),
                dim=1,
            ),
            start_dim=0,
            end_dim=1,
        ).to(device)

        y = batch["y"]

        if fp16:
            with autocast():
                predictions: torch.Tensor = model.predict(x).cpu()
        else:
            predictions: torch.Tensor = model.predict(x).cpu()

        correct += (predictions == y).sum().item()
        total += len(predictions)

    return correct / total
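
# --- Illustrative usage -------------------------------------------------------
# A sketch of how the DataLoader-based evaluate() above can be called. It mirrors
# the way the final train() below builds its dev/test loaders with Tedd1104Dataset;
# the directory path, batch size and number of workers are placeholders, and the
# function assumes the module-level `device` used throughout this file.
def _example_evaluate_dev(model: TEDD1104) -> float:
    from torch.utils.data import DataLoader  # also imported at module level

    dev_loader = DataLoader(
        Tedd1104Dataset(
            dataset_dir="Data\\GTAV-AI\\data-v2\\dev\\",
            hide_map_prob=0,
            dropout_images_prob=[0, 0, 0, 0, 0],
        ),
        batch_size=8,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
    )
    return evaluate(model=model, data_loader=dev_loader, device=device, fp16=True)
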
def train_new_model(
    train_dir="Data\\GTAV-AI\\data-v2\\train\\",
    dev_dir="Data\\GTAV-AI\\data-v2\\dev\\",
    test_dir="Data\\GTAV-AI\\data-v2\\test\\",
    output_dir="Data\\models\\",
    batch_size=10,
    num_epoch=20,
    optimizer_name="SGD",
    resnet: int = 18,
    pretrained_resnet: bool = True,
    sequence_size: int = 5,
    embedded_size: int = 256,
    hidden_size: int = 128,
    num_layers_lstm: int = 1,
    bidirectional_lstm: bool = False,
    layers_out: List[int] = None,
    dropout_cnn: float = 0.1,
    dropout_cnn_out: float = 0.1,
    dropout_lstm: float = 0.1,
    dropout_lstm_out: float = 0.1,
    hide_map_prob: float = 0.0,
    num_load_files_training: int = 5,
    fp16=True,
    apex_opt_level="O2",
    save_checkpoints=True,
    save_every: int = 100,
    save_best=True,
):
    """
    Train a new model

    Input:
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (around 10 for an 8GB GPU)
    - num_epoch: Number of epochs to do
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - resnet: resnet module to use [18, 34, 50, 101, 152]
    - pretrained_resnet: Load pretrained resnet weights
    - sequence_size: Length of each series of features
    - embedded_size: Size of the feature vectors
    - hidden_size: LSTM hidden size
    - num_layers_lstm: number of layers in the LSTM
    - bidirectional_lstm: forward or bidirectional LSTM
    - layers_out: list of integers, for each integer i a linear layer with i neurons will be added
    - dropout_cnn: dropout probability for the CNN layers
    - dropout_cnn_out: dropout probability for the CNN features (output layer)
    - dropout_lstm: dropout probability for the LSTM
    - dropout_lstm_out: dropout probability for the LSTM features (output layer)
    - hide_map_prob: Probability of removing the minimap (black square) from the image (0<=hide_map_prob<=1)
    - num_load_files_training: Number of training files to load into memory at the same time
    - fp16: Use FP16 for training
    - apex_opt_level: If FP16 training, Nvidia apex opt level
    - save_checkpoints: save a checkpoint every save_every iterations (each checkpoint overwrites the previous one)
    - save_every: how often (in iterations) a checkpoint is saved
    - save_best: save the model that achieves the highest accuracy in the development set

    Output:
    """

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    print("Loading new model")
    model: TEDD1104 = TEDD1104(
        resnet=resnet,
        pretrained_resnet=pretrained_resnet,
        sequence_size=sequence_size,
        embedded_size=embedded_size,
        hidden_size=hidden_size,
        num_layers_lstm=num_layers_lstm,
        bidirectional_lstm=bidirectional_lstm,
        layers_out=layers_out,
        dropout_cnn=dropout_cnn,
        dropout_cnn_out=dropout_cnn_out,
        dropout_lstm=dropout_lstm,
        dropout_lstm_out=dropout_lstm_out,
    ).to(device)

    if optimizer_name == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    elif optimizer_name == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=0.001)
    else:
        raise ValueError(
            f"Optimizer {optimizer_name} not implemented. Available optimizers: SGD, Adam"
        )

    if fp16:
        model, optimizer = amp.initialize(
            model,
            optimizer,
            opt_level=apex_opt_level,
            keep_batchnorm_fp32=True,
            loss_scale="dynamic",
        )

    max_acc = train(
        model=model,
        optimizer_name=optimizer_name,
        optimizer=optimizer,
        train_dir=train_dir,
        dev_dir=dev_dir,
        test_dir=test_dir,
        output_dir=output_dir,
        batch_size=batch_size,
        initial_epoch=0,
        num_epoch=num_epoch,
        max_acc=0.0,
        hide_map_prob=hide_map_prob,
        num_load_files_training=num_load_files_training,
        fp16=fp16,
        amp_opt_level=apex_opt_level if fp16 else None,
        save_checkpoints=save_checkpoints,
        save_every=save_every,
        save_best=save_best,
    )

    print(f"Training finished, max accuracy in the development set {max_acc}")
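
# --- Illustrative usage -------------------------------------------------------
# A minimal call of train_new_model() above. Every value is a placeholder; the
# defaults shown in the signature work as well. This is a sketch, not a command
# taken from the repository.
def _example_train_new_model() -> None:
    train_new_model(
        train_dir="Data\\GTAV-AI\\data-v2\\train\\",
        dev_dir="Data\\GTAV-AI\\data-v2\\dev\\",
        test_dir="Data\\GTAV-AI\\data-v2\\test\\",
        output_dir="Data\\models\\",
        batch_size=10,
        num_epoch=20,
        optimizer_name="SGD",
        resnet=18,
        fp16=True,
        apex_opt_level="O2",
    )
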
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    initial_epoch: int,
    num_epoch: int,
    max_acc: float,
    hide_map_prob: float,
    num_load_files_training: int,
    fp16: bool = True,
    amp_opt_level=None,
    save_checkpoints: bool = True,
    save_every: int = 100,
    save_best: bool = True,
):
    """
    Train a model

    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (around 10 for an 8GB GPU)
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been restored from a checkpoint)
    - num_epoch: Number of epochs to do
    - max_acc: Accuracy in the development set (0 unless the model has been restored from a checkpoint)
    - hide_map_prob: Probability of removing the minimap (black square) from the image (0<=hide_map_prob<=1)
    - num_load_files_training: Number of training files to load into memory at the same time
    - fp16: Use FP16 for training
    - amp_opt_level: If FP16 training, Nvidia apex opt level
    - save_checkpoints: save a checkpoint every save_every iterations (each checkpoint overwrites the previous one)
    - save_every: how often (in iterations) a checkpoint is saved
    - save_best: save the model that achieves the highest accuracy in the development set

    Output:
    - float: Accuracy in the development set of the best model
    """

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss()

    print("Loading dev set")
    X_dev, y_dev = load_dataset(dev_dir, fp=16 if fp16 else 32)
    X_dev = torch.from_numpy(X_dev)
    print("Loading test set")
    X_test, y_test = load_dataset(test_dir, fp=16 if fp16 else 32)
    X_test = torch.from_numpy(X_test)

    acc_dev: float = 0.0
    total_training_examples: int = 0

    printTrace("Training...")
    for epoch in range(num_epoch):
        iteration_no = 0
        num_used_files: int = 0
        files: List[str] = glob.glob(os.path.join(train_dir, "*.npz"))
        random.shuffle(files)

        # Get the files in groups, all files in a group will be loaded and their data shuffled
        for paths in batch(files, num_load_files_training):
            iteration_no += 1
            num_used_files += num_load_files_training
            model.train()
            start_time: float = time.time()

            X, y = load_and_shuffle_datasets(
                paths=paths, fp=16 if fp16 else 32, hide_map_prob=hide_map_prob
            )
            total_training_examples += len(y)
            running_loss = 0.0
            num_batches = 0

            for X_batch, y_batch in nn_batchs(X, y, batch_size):
                X_batch, y_batch = (
                    torch.from_numpy(X_batch).to(device),
                    torch.from_numpy(y_batch).long().to(device),
                )
                optimizer.zero_grad()
                outputs = model.forward(X_batch)
                loss = criterion(outputs, y_batch)

                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                # Gradient clipping
                if fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.0)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                optimizer.step()

                running_loss += loss.item()
                num_batches += 1

            start_time_eval: float = time.time()

            # Print statistics
            if len(X) > 0 and len(y) > 0:
                acc_train = evaluate(
                    model=model,
                    X=torch.from_numpy(X),
                    golds=y,
                    device=device,
                    batch_size=batch_size,
                )
            else:
                acc_train = -1.0

            acc_dev = evaluate(
                model=model,
                X=X_dev,
                golds=y_dev,
                device=device,
                batch_size=batch_size,
            )

            acc_test = evaluate(
                model=model,
                X=X_test,
                golds=y_test,
                device=device,
                batch_size=batch_size,
            )

            printTrace(
                f"EPOCH: {initial_epoch+epoch}. Iteration {iteration_no}. "
                f"{num_used_files} of {len(files)} files. "
                f"Total examples used for training {total_training_examples}. "
                f"Iteration time: {time.time() - start_time} secs. Eval time: {time.time() - start_time_eval} secs."
            )

            printTrace(
                f"Loss: {-1 if num_batches == 0 else running_loss / num_batches}. Acc training set: {acc_train}. "
                f"Acc dev set: {acc_dev}. Acc test set: {acc_test}"
            )

            if acc_dev > max_acc and save_best:
                max_acc = acc_dev
                printTrace(f"New max acc in dev set {max_acc}. Saving model...")
                save_model(
                    model=model,
                    save_dir=output_dir,
                    fp16=fp16,
                    amp_opt_level=amp_opt_level,
                )

            if save_checkpoints and iteration_no % save_every == 0:
                printTrace("Saving checkpoint...")
                save_checkpoint(
                    path=os.path.join(output_dir, "checkpoint.pt"),
                    model=model,
                    optimizer_name=optimizer_name,
                    optimizer=optimizer,
                    acc_dev=acc_dev,
                    epoch=initial_epoch + epoch,
                    fp16=fp16,
                    opt_level=amp_opt_level,
                )

    return max_acc
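
# --- Illustrative helper sketch ---------------------------------------------
# The file-chunking loop above relies on a batch(files, n) helper defined
# elsewhere in this repository. This is NOT that helper, only a minimal sketch
# of what it is assumed to do here: yield successive groups of up to n paths.
from typing import Iterator, Sequence


def _batch_sketch(items: Sequence[str], n: int) -> Iterator[List[str]]:
    """Minimal stand-in for batch(): yield successive chunks of up to n items."""
    for start in range(0, len(items), n):
        yield list(items[start : start + n])
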
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    scheduler: torch.optim.lr_scheduler,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    accumulation_steps: int,
    initial_epoch: int,
    num_epoch: int,
    max_acc: float,
    hide_map_prob: float,
    dropout_images_prob: List[float],
    num_load_files_training: int,
    fp16: bool = True,
    amp_opt_level=None,
    save_checkpoints: bool = True,
    eval_every: int = 5,
    save_every: int = 20,
    save_best: bool = True,
):
    """
    Train a model

    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - scheduler: Learning rate scheduler (torch.optim.lr_scheduler)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (around 10 for an 8GB GPU)
    - accumulation_steps: Number of gradient accumulation steps; the effective batch size is batch_size * accumulation_steps
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been restored from a checkpoint)
    - num_epoch: Number of epochs to do
    - max_acc: Accuracy in the development set (0 unless the model has been restored from a checkpoint)
    - hide_map_prob: Probability of removing the minimap (put a black square) from a training example (0<=hide_map_prob<=1)
    - dropout_images_prob: List of 5 floats or None, probability of removing each input image during training
      (black image) from a training example (0<=dropout_images_prob<=1)
    - num_load_files_training: Number of training files to load into memory at the same time
    - fp16: Use FP16 for training
    - amp_opt_level: If FP16 training, Nvidia apex opt level
    - save_checkpoints: save a checkpoint every save_every iterations (each checkpoint overwrites the previous one)
    - eval_every: evaluate the model on the dev and test sets every eval_every iterations
    - save_every: save a checkpoint every save_every iterations
    - save_best: save the model that achieves the highest accuracy in the development set

    Output:
    - float: Accuracy in the development set of the best model
    """

    writer: SummaryWriter = SummaryWriter()

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss()

    print("Loading dev set")
    X_dev, y_dev = load_dataset(dev_dir, fp=16 if fp16 else 32)
    X_dev = torch.from_numpy(X_dev)
    print("Loading test set")
    X_test, y_test = load_dataset(test_dir, fp=16 if fp16 else 32)
    X_test = torch.from_numpy(X_test)

    total_training_examples: int = 0
    model.zero_grad()

    printTrace("Training...")
    for epoch in range(num_epoch):
        step_no: int = 0
        iteration_no: int = 0
        num_used_files: int = 0

        data_loader = DataLoaderTEDD(
            dataset_dir=train_dir,
            nfiles2load=num_load_files_training,
            hide_map_prob=hide_map_prob,
            dropout_images_prob=dropout_images_prob,
            fp=16 if fp16 else 32,
        )
        data = data_loader.get_next()

        # Get the files in groups, all files in a group will be loaded and their data shuffled
        while data:
            X, y = data
            model.train()
            start_time: float = time.time()
            total_training_examples += len(y)
            running_loss: float = 0.0
            num_batches: int = 0
            acc_dev: float = 0.0

            for X_batch, y_batch in nn_batchs(X, y, batch_size):
                X_batch, y_batch = (
                    torch.from_numpy(X_batch).to(device),
                    torch.from_numpy(y_batch).long().to(device),
                )

                outputs = model.forward(X_batch)
                loss = criterion(outputs, y_batch) / accumulation_steps
                running_loss += loss.item()

                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                # Gradient clipping
                if fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.0)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                if (step_no + 1) % accumulation_steps == 0 or (
                    num_used_files + 1 > len(data_loader) - num_load_files_training
                    and num_batches == math.ceil(len(y) / batch_size) - 1
                ):
                    # If we are in the last batch of the epoch we also want to perform gradient descent
                    optimizer.step()
                    model.zero_grad()

                num_batches += 1
                step_no += 1

            num_used_files += num_load_files_training

            # Print statistics
            printTrace(
                f"EPOCH: {initial_epoch+epoch}. Iteration {iteration_no}. "
                f"{num_used_files} of {len(data_loader)} files. "
                f"Total examples used for training {total_training_examples}. "
                f"Iteration time: {round(time.time() - start_time,2)} secs."
            )
            printTrace(
                f"Loss: {-1 if num_batches == 0 else running_loss / num_batches}. "
                f"Learning rate {optimizer.state_dict()['param_groups'][0]['lr']}"
            )

            writer.add_scalar("Loss/train", running_loss / num_batches, iteration_no)

            scheduler.step(running_loss / num_batches)

            if (iteration_no + 1) % eval_every == 0:
                start_time_eval: float = time.time()

                if len(X) > 0 and len(y) > 0:
                    acc_train: float = evaluate(
                        model=model,
                        X=torch.from_numpy(X),
                        golds=y,
                        device=device,
                        batch_size=batch_size,
                    )
                else:
                    acc_train = -1.0

                acc_dev: float = evaluate(
                    model=model,
                    X=X_dev,
                    golds=y_dev,
                    device=device,
                    batch_size=batch_size,
                )

                acc_test: float = evaluate(
                    model=model,
                    X=X_test,
                    golds=y_test,
                    device=device,
                    batch_size=batch_size,
                )

                printTrace(
                    f"Acc training set: {round(acc_train,2)}. "
                    f"Acc dev set: {round(acc_dev,2)}. "
                    f"Acc test set: {round(acc_test,2)}. "
                    f"Eval time: {round(time.time() - start_time_eval,2)} secs."
                )

                if 0.0 < acc_dev > max_acc and save_best:
                    max_acc = acc_dev
                    printTrace(f"New max acc in dev set {round(max_acc,2)}. Saving model...")
                    save_model(
                        model=model,
                        save_dir=output_dir,
                        fp16=fp16,
                        amp_opt_level=amp_opt_level,
                    )

                if acc_train > -1:
                    writer.add_scalar("Accuracy/train", acc_train, iteration_no)
                writer.add_scalar("Accuracy/dev", acc_dev, iteration_no)
                writer.add_scalar("Accuracy/test", acc_test, iteration_no)

            if save_checkpoints and (iteration_no + 1) % save_every == 0:
                printTrace("Saving checkpoint...")
                save_checkpoint(
                    path=os.path.join(output_dir, "checkpoint.pt"),
                    model=model,
                    optimizer_name=optimizer_name,
                    optimizer=optimizer,
                    scheduler=scheduler,
                    acc_dev=acc_dev,
                    epoch=initial_epoch + epoch,
                    fp16=fp16,
                    opt_level=amp_opt_level,
                )

            iteration_no += 1
            data = data_loader.get_next()

        data_loader.close()

    return max_acc
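
# --- Illustrative pattern -----------------------------------------------------
# The train() above divides the loss by accumulation_steps and only calls
# optimizer.step() every accumulation_steps mini-batches (or on the last batch).
# The function below shows that gradient-accumulation pattern in isolation; the
# model, data_loader, optimizer and criterion arguments are placeholders, and
# this is a sketch rather than repository code.
def _gradient_accumulation_sketch(model, data_loader, optimizer, criterion, accumulation_steps, device):
    model.train()
    model.zero_grad()
    for step_no, (x, y) in enumerate(data_loader):
        x, y = x.to(device), y.to(device)
        # Scale the loss so the accumulated gradient matches a large-batch gradient
        loss = criterion(model(x), y) / accumulation_steps
        loss.backward()
        # Step only every accumulation_steps batches, and always on the last batch
        if (step_no + 1) % accumulation_steps == 0 or (step_no + 1) == len(data_loader):
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            model.zero_grad()
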
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    scheduler: torch.optim.lr_scheduler,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    accumulation_steps: int,
    initial_epoch: int,
    num_epoch: int,
    max_acc: float,
    hide_map_prob: float,
    dropout_images_prob: List[float],
    num_load_files_training: int,
    fp16: bool = True,
    amp_opt_level=None,
    save_checkpoints: bool = True,
    eval_every: int = 5,
    save_every: int = 20,
    save_best: bool = True,
):
    """
    Train a model

    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - scheduler: Learning rate scheduler (torch.optim.lr_scheduler)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (around 10 for an 8GB GPU)
    - accumulation_steps: Number of gradient accumulation steps; the effective batch size is batch_size * accumulation_steps
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been restored from a checkpoint)
    - num_epoch: Number of epochs to do
    - max_acc: Accuracy in the development set (0 unless the model has been restored from a checkpoint)
    - hide_map_prob: Probability of removing the minimap (put a black square) from a training example (0<=hide_map_prob<=1)
    - dropout_images_prob: List of 5 floats or None, probability of removing each input image during training
      (black image) from a training example (0<=dropout_images_prob<=1)
    - fp16: Use FP16 for training
    - amp_opt_level: If FP16 training, Nvidia apex opt level
    - save_checkpoints: save a checkpoint every save_every iterations (each checkpoint overwrites the previous one)
    - eval_every: evaluate the model on the dev and test sets every eval_every iterations
    - save_every: save a checkpoint every save_every iterations
    - save_best: save the model that achieves the highest accuracy in the development set

    Output:
    - float: Accuracy in the development set of the best model
    """

    writer: SummaryWriter = SummaryWriter()

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss()

    print("Loading dev set")
    X_dev, y_dev = load_dataset(dev_dir, fp=16 if fp16 else 32)
    X_dev = torch.from_numpy(X_dev)
    print("Loading test set")
    X_test, y_test = load_dataset(test_dir, fp=16 if fp16 else 32)
    X_test = torch.from_numpy(X_test)

    total_training_examples: int = 0
    model.zero_grad()

    trainLoader = DataLoader(
        dataset=PickleDataset(train_dir),
        batch_size=batch_size,
        shuffle=False,
        num_workers=8,
    )

    printTrace("Training...")
    iteration_no: int = 0
    for epoch in range(num_epoch):
        print("EpochNum: " + str(epoch))
        model.train()
        start_time: float = time.time()
        running_loss: float = 0.0
        acc_dev: float = 0.0

        for num_batches, inputs in enumerate(trainLoader):
            # inputs[0]: [batch_size, 5 * 3, H, W] -> [batch_size * 5, 3, H, W]
            X_batch = torch.reshape(
                inputs[0],
                (inputs[0].shape[0] * 5, 3, inputs[0].shape[2], inputs[0].shape[3]),
            ).to(device)
            y_batch = torch.reshape(inputs[1], (inputs[0].shape[0],)).long().to(device)

            outputs = model.forward(X_batch)
            loss = criterion(outputs, y_batch) / accumulation_steps
            running_loss += loss.item()

            if fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Gradient clipping
            if fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.0)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            model.zero_grad()

        # Print statistics (enumerate is 0-based, so num_batches + 1 batches were processed)
        printTrace(
            f"Loss: {running_loss / (num_batches + 1)}. "
            f"Learning rate {optimizer.state_dict()['param_groups'][0]['lr']}"
        )

        writer.add_scalar("Loss/train", running_loss, iteration_no)

        if (iteration_no + 1) % eval_every == 0:
            start_time_eval: float = time.time()

            acc_dev: float = evaluate(
                model=model,
                X=X_dev,
                golds=y_dev,
                device=device,
                batch_size=batch_size,
            )

            acc_test: float = evaluate(
                model=model,
                X=X_test,
                golds=y_test,
                device=device,
                batch_size=batch_size,
            )

            printTrace(
                f"Acc dev set: {round(acc_dev,2)}. "
                f"Acc test set: {round(acc_test,2)}. "
                f"Eval time: {round(time.time() - start_time_eval,2)} secs."
            )

            if 0.0 < acc_dev > max_acc and save_best:
                max_acc = acc_dev
                printTrace(f"New max acc in dev set {round(max_acc,2)}. Saving model...")
                save_model(
                    model=model,
                    save_dir=output_dir,
                    fp16=fp16,
                    amp_opt_level=amp_opt_level,
                )

            writer.add_scalar("Accuracy/dev", acc_dev, iteration_no)
            writer.add_scalar("Accuracy/test", acc_test, iteration_no)

        if save_checkpoints and (iteration_no + 1) % save_every == 0:
            printTrace("Saving checkpoint...")
            save_checkpoint(
                path=os.path.join(output_dir, "checkpoint.pt"),
                model=model,
                optimizer_name=optimizer_name,
                optimizer=optimizer,
                scheduler=scheduler,
                acc_dev=acc_dev,
                epoch=initial_epoch + epoch,
                fp16=fp16,
                opt_level=amp_opt_level,
            )

        iteration_no += 1

    return max_acc
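
# --- Illustrative shape check ---------------------------------------------------
# The reshape in the DataLoader-based train() above assumes each PickleDataset
# element stacks the 5 RGB frames of a sequence along the channel dimension,
# i.e. a [batch, 5 * 3, H, W] tensor that is unfolded into [batch * 5, 3, H, W].
# A small check of that assumption with dummy data (H and W are placeholders):
def _reshape_shape_check() -> None:
    batch_size, H, W = 4, 270, 480
    stacked = torch.zeros(batch_size, 5 * 3, H, W)  # 5 RGB frames stacked on channels
    unfolded = torch.reshape(stacked, (batch_size * 5, 3, H, W))
    assert unfolded.shape == (batch_size * 5, 3, H, W)
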
def train_new_model(
    train_dir="Data\\GTAV-AI\\data-v2\\train\\",
    dev_dir="Data\\GTAV-AI\\data-v2\\dev\\",
    test_dir="Data\\GTAV-AI\\data-v2\\test\\",
    output_dir="Data\\models\\",
    batch_size=10,
    accumulation_steps: int = 1,
    num_epoch=20,
    optimizer_name="SGD",
    learning_rate: float = 0.01,
    scheduler_patience: int = 10000,
    resnet: int = 18,
    pretrained_resnet: bool = True,
    sequence_size: int = 5,
    embedded_size: int = 256,
    hidden_size: int = 128,
    num_layers_lstm: int = 1,
    bidirectional_lstm: bool = False,
    layers_out: List[int] = None,
    dropout_cnn: float = 0.1,
    dropout_cnn_out: float = 0.1,
    dropout_lstm: float = 0.1,
    dropout_lstm_out: float = 0.1,
    hide_map_prob: float = 0.0,
    dropout_images_prob=None,
    fp16=True,
    save_checkpoints=True,
    save_every: int = 20,
    save_best=True,
):
    """
    Train a new model

    Input:
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (around 10 for an 8GB GPU)
    - accumulation_steps: Number of gradient accumulation steps; the effective batch size is batch_size * accumulation_steps
    - num_epoch: Number of epochs to do
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - learning_rate: Learning rate for the optimizer
    - scheduler_patience: Number of steps without improvement before the scheduler reduces the learning rate
    - resnet: resnet module to use [18, 34, 50, 101, 152]
    - pretrained_resnet: Load pretrained resnet weights
    - sequence_size: Length of each series of features
    - embedded_size: Size of the feature vectors
    - hidden_size: LSTM hidden size
    - num_layers_lstm: number of layers in the LSTM
    - bidirectional_lstm: forward or bidirectional LSTM
    - layers_out: list of integers, for each integer i a linear layer with i neurons will be added
    - dropout_cnn: dropout probability for the CNN layers
    - dropout_cnn_out: dropout probability for the CNN features (output layer)
    - dropout_lstm: dropout probability for the LSTM
    - dropout_lstm_out: dropout probability for the LSTM features (output layer)
    - hide_map_prob: Probability of removing the minimap (put a black square) from a training example (0<=hide_map_prob<=1)
    - dropout_images_prob: List of 5 floats or None, probability of removing each input image during training
      (black image) from a training example (0<=dropout_images_prob<=1)
    - fp16: Use FP16 for training
    - save_checkpoints: save a checkpoint every save_every iterations (each checkpoint overwrites the previous one)
    - save_every: save a checkpoint every save_every iterations
    - save_best: save the model that achieves the highest accuracy in the development set

    Output:
    """

    print("Loading new model")
    model: TEDD1104 = TEDD1104(
        resnet=resnet,
        pretrained_resnet=pretrained_resnet,
        sequence_size=sequence_size,
        embedded_size=embedded_size,
        hidden_size=hidden_size,
        num_layers_lstm=num_layers_lstm,
        bidirectional_lstm=bidirectional_lstm,
        layers_out=layers_out,
        dropout_cnn=dropout_cnn,
        dropout_cnn_out=dropout_cnn_out,
        dropout_lstm=dropout_lstm,
        dropout_lstm_out=dropout_lstm_out,
    ).to(device)

    if optimizer_name == "SGD":
        optimizer = optim.SGD(
            model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True
        )
    elif optimizer_name == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.01)
    else:
        raise ValueError(
            f"Optimizer {optimizer_name} not implemented. Available optimizers: SGD, Adam"
        )

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, verbose=True, patience=scheduler_patience, factor=0.5
    )

    max_acc = train(
        model=model,
        optimizer_name=optimizer_name,
        optimizer=optimizer,
        scheduler=scheduler,
        train_dir=train_dir,
        dev_dir=dev_dir,
        test_dir=test_dir,
        output_dir=output_dir,
        batch_size=batch_size,
        accumulation_steps=accumulation_steps,
        initial_epoch=0,
        num_epoch=num_epoch,
        running_loss=0.0,
        total_batches=0,
        total_training_examples=0,
        max_acc=0.0,
        hide_map_prob=hide_map_prob,
        dropout_images_prob=dropout_images_prob,
        fp16=fp16,
        scaler=GradScaler() if fp16 else None,
        save_checkpoints=save_checkpoints,
        save_every=save_every,
        save_best=save_best,
    )

    print(f"Training finished, max accuracy in the development set {max_acc}")
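
# --- Illustrative usage -------------------------------------------------------
# A minimal call of the newer train_new_model() above, highlighting the knobs it
# adds over the first version (gradient accumulation, learning rate, scheduler
# patience, per-image dropout). All values are placeholders; this is a sketch,
# not a command taken from the repository.
def _example_train_new_model_v2() -> None:
    train_new_model(
        train_dir="Data\\GTAV-AI\\data-v2\\train\\",
        dev_dir="Data\\GTAV-AI\\data-v2\\dev\\",
        test_dir="Data\\GTAV-AI\\data-v2\\test\\",
        output_dir="Data\\models\\",
        batch_size=10,
        accumulation_steps=4,  # effective batch size of 40
        learning_rate=0.01,
        scheduler_patience=10000,
        hide_map_prob=0.2,
        dropout_images_prob=[0.1, 0.1, 0.1, 0.1, 0.1],
        fp16=True,
    )
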
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    scheduler: torch.optim.lr_scheduler,
    scaler: GradScaler,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    accumulation_steps: int,
    initial_epoch: int,
    num_epoch: int,
    running_loss: float,
    total_batches: int,
    total_training_examples: int,
    max_acc: float,
    hide_map_prob: float,
    dropout_images_prob: List[float],
    fp16: bool = True,
    save_checkpoints: bool = True,
    save_every: int = 20,
    save_best: bool = True,
):
    """
    Train a model

    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - scheduler: Learning rate scheduler (torch.optim.lr_scheduler)
    - scaler: torch.cuda.amp GradScaler used for FP16 training (None if fp16 is False)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (around 10 for an 8GB GPU)
    - accumulation_steps: Number of gradient accumulation steps; the effective batch size is batch_size * accumulation_steps
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been restored from a checkpoint)
    - num_epoch: Number of epochs to do
    - running_loss: Accumulated training loss (0 unless the model has been restored from a checkpoint)
    - total_batches: Number of batches already used to train the model (0 unless the model has been restored from a checkpoint)
    - total_training_examples: Number of examples already used to train the model (0 unless the model has been restored from a checkpoint)
    - max_acc: Accuracy in the development set (0 unless the model has been restored from a checkpoint)
    - hide_map_prob: Probability of removing the minimap (put a black square) from a training example (0<=hide_map_prob<=1)
    - dropout_images_prob: List of 5 floats or None, probability of removing each input image during training
      (black image) from a training example (0<=dropout_images_prob<=1)
    - fp16: Use FP16 for training
    - save_checkpoints: save a checkpoint every save_every batches (each checkpoint overwrites the previous one)
    - save_every: save a checkpoint every save_every batches
    - save_best: save the model that achieves the highest accuracy in the development set

    Output:
    - float: Accuracy in the development set of the best model
    """

    if not os.path.exists(output_dir):
        print(f"{output_dir} does not exist. We will create it.")
        os.makedirs(output_dir)

    writer: SummaryWriter = SummaryWriter()

    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss().to(device)
    model.zero_grad()

    print_message("Training...")
    for epoch in range(num_epoch):
        acc_dev: float = 0.0
        num_batches: int = 0
        step_no: int = 0

        data_loader_train = DataLoader(
            Tedd1104Dataset(
                dataset_dir=train_dir,
                hide_map_prob=hide_map_prob,
                dropout_images_prob=dropout_images_prob,
            ),
            batch_size=batch_size,
            shuffle=True,
            num_workers=os.cpu_count(),
            pin_memory=True,
        )

        start_time: float = time.time()
        step_start_time: float = time.time()
        dataloader_delay: float = 0

        model.train()
        for batch in data_loader_train:
            # Stack the 5 images of each sequence and flatten them into a single
            # batch dimension: [batch_size * 5, 3, H, W]
            x = torch.flatten(
                torch.stack(
                    (
                        batch["image1"],
                        batch["image2"],
                        batch["image3"],
                        batch["image4"],
                        batch["image5"],
                    ),
                    dim=1,
                ),
                start_dim=0,
                end_dim=1,
            ).to(device)

            y = batch["y"].to(device)
            dataloader_delay += time.time() - step_start_time

            total_training_examples += len(y)

            if fp16:
                with autocast():
                    outputs = model.forward(x)
                    loss = criterion(outputs, y)
                loss = loss / accumulation_steps
                running_loss += loss.item()
                scaler.scale(loss).backward()
            else:
                outputs = model.forward(x)
                loss = criterion(outputs, y) / accumulation_steps
                running_loss += loss.item()
                loss.backward()

            if ((step_no + 1) % accumulation_steps == 0) or (
                step_no + 1 >= len(data_loader_train)
            ):  # If we are in the last batch of the epoch we also want to perform gradient descent
                if fp16:
                    # Gradient clipping
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                else:
                    # Gradient clipping
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    optimizer.zero_grad()

                total_batches += 1
                num_batches += 1

                scheduler.step(running_loss / total_batches)

                batch_time = round(time.time() - start_time, 2)
                est: float = batch_time * (
                    math.ceil(len(data_loader_train) / accumulation_steps) - num_batches
                )
                print_message(
                    f"EPOCH: {initial_epoch + epoch}. "
                    f"{num_batches} of {math.ceil(len(data_loader_train)/accumulation_steps)} batches. "
                    f"Total examples used for training {total_training_examples}. "
                    f"Iteration time: {batch_time} secs. "
                    f"Data Loading bottleneck: {round(dataloader_delay, 2)} secs. "
                    f"Epoch estimated time: "
                    f"{str(datetime.timedelta(seconds=est)).split('.')[0]}"
                )

                print_message(
                    f"Loss: {running_loss / total_batches}. "
                    f"Learning rate {optimizer.state_dict()['param_groups'][0]['lr']}"
                )

                writer.add_scalar("Loss/train", running_loss / total_batches, total_batches)

                if save_checkpoints and (total_batches + 1) % save_every == 0:
                    print_message("Saving checkpoint...")
                    save_checkpoint(
                        path=os.path.join(output_dir, "checkpoint.pt"),
                        model=model,
                        optimizer_name=optimizer_name,
                        optimizer=optimizer,
                        scheduler=scheduler,
                        running_loss=running_loss,
                        total_batches=total_batches,
                        total_training_examples=total_training_examples,
                        acc_dev=max_acc,
                        epoch=initial_epoch + epoch,
                        fp16=fp16,
                        scaler=None if not fp16 else scaler,
                    )

                dataloader_delay = 0
                start_time = time.time()

            step_no += 1
            step_start_time = time.time()

        del data_loader_train

        print_message("Dev set evaluation...")
        start_time_eval: float = time.time()
        data_loader_dev = DataLoader(
            Tedd1104Dataset(
                dataset_dir=dev_dir,
                hide_map_prob=0,
                dropout_images_prob=[0, 0, 0, 0, 0],
            ),
            batch_size=batch_size // 2,  # Use a smaller batch size to prevent OOM issues
            shuffle=False,
            num_workers=os.cpu_count() // 2,  # Use fewer cores to save RAM
            pin_memory=True,
        )

        acc_dev: float = evaluate(
            model=model,
            data_loader=data_loader_dev,
            device=device,
            fp16=fp16,
        )

        del data_loader_dev

        print_message("Test set evaluation...")
        data_loader_test = DataLoader(
            Tedd1104Dataset(
                dataset_dir=test_dir,
                hide_map_prob=0,
                dropout_images_prob=[0, 0, 0, 0, 0],
            ),
            batch_size=batch_size // 2,  # Use a smaller batch size to prevent OOM issues
            shuffle=False,
            num_workers=os.cpu_count() // 2,  # Use fewer cores to save RAM
            pin_memory=True,
        )

        acc_test: float = evaluate(
            model=model,
            data_loader=data_loader_test,
            device=device,
            fp16=fp16,
        )

        del data_loader_test

        print_message(
            f"Acc dev set: {round(acc_dev*100,2)}. "
            f"Acc test set: {round(acc_test*100,2)}. "
            f"Eval time: {round(time.time() - start_time_eval,2)} secs."
        )

        if 0.0 < acc_dev > max_acc and save_best:
            max_acc = acc_dev
            print_message(f"New max acc in dev set {round(max_acc, 2)}. Saving model...")
            save_model(
                model=model,
                save_dir=output_dir,
                fp16=fp16,
            )

        writer.add_scalar("Accuracy/dev", acc_dev, epoch)
        writer.add_scalar("Accuracy/test", acc_test, epoch)

    return max_acc