def main():
    """Command-line entry point: load the data set, build and train the
    network, validate it on the test set and save a checkpoint.

    All options come from ``get_input_args()`` (data/save dirs, architecture,
    learning rate, hidden units, epochs, gpu flag).
    """
    in_arg = get_input_args()
    data_dir = in_arg.data_dir
    save_dir = in_arg.save_dir
    arch = in_arg.arch
    learning_rate = in_arg.learning_rate
    hidden_units = in_arg.hidden_units
    epochs = in_arg.epochs
    # FIX: the original if/elif pair left `processing_unit` unbound (NameError
    # downstream) whenever CUDA was available but --gpu was not 'gpu'.
    # Anything other than an explicit, satisfiable GPU request now falls back
    # to the CPU.
    if torch.cuda.is_available() and in_arg.gpu == 'gpu':
        print('GPU will be used')
        processing_unit = 'gpu'
    else:
        print('CPU will be used')
        processing_unit = 'cpu'
    print(in_arg)
    training_dataloaders, validation_dataloaders, testing_dataloaders, class_to_idx = load_datas(
        data_dir)
    pre_model = pretrained_model(arch)  # backbone selected by --arch
    model = classifier(pre_model, hidden_units)  # attach new classifier head
    after_train_model = train_model(model, training_dataloaders,
                                    validation_dataloaders, learning_rate,
                                    epochs, processing_unit)
    valid_model(after_train_model, testing_dataloaders, processing_unit)
    save_checkpoint(model, save_dir, class_to_idx)
def main(): in_arg = get_input_args() # Creates and returns command line arguments print('\nData Directory:\n', in_arg.data_directory, '\n') print('Optional Command Line Arguments:\n', 'Save Checkpoint [--save_dir]: ', in_arg.save_dir, '\n', 'Pretrained Network [--arch]: ', in_arg.arch, '\n', 'Learning Rate [--learning_rate]: ', in_arg.learning_rate, '\n', 'Hidden Units [--hidden_units]: ', in_arg.hidden_units, '\n', 'Epochs [--epochs]: ', in_arg.epochs, '\n', 'GPU [--gpu]: ', in_arg.gpu, '\n') if 'checkpoints' not in listdir( ): # makes checkpoints folder if it doesn't already exist mkdir('checkpoints') train_dir, valid_dir, test_dir = util.get_data( in_arg.data_directory ) # Returns Train, Validation and Test Directories transformed_train, transformed_valid, transformed_test = mod.transform_data( train_dir, valid_dir, test_dir) # Returns transformed datasets train_loader, valid_loader, test_loader = mod.load_data( transformed_train, transformed_valid, transformed_test) # Returns Data loaders model = mod.build_model( util.label_count(train_dir), in_arg.hidden_units, in_arg.arch, transformed_train.class_to_idx) # Returns built model epochs = in_arg.epochs # Epochs initially set by command line argument in_arg.epochs. Can be changed with m.load_checkpoint() criterion = nn.NLLLoss() optimizer = optim.Adam(model.classifier.parameters(), lr=in_arg.learning_rate) use_gpu = mod.use_gpu(model, in_arg.gpu) # Returns True or False for GPU use mod.train( model, criterion, optimizer, train_loader, valid_loader, use_gpu, in_arg.epochs ) # Trains the model. Prints Training Loss, Validation Loss & Validation Accuracy mod.save_checkpoint( in_arg.arch, model.classifier.state_dict(), transformed_train.class_to_idx, util.label_count(train_dir), in_arg.hidden_units, in_arg.epochs, in_arg.save_dir ) # Saves classifier and other model parameters to checkpoint
def save_ds_checkpoint(iteration, model, args, lr_scheduler=None):
    """Save a model checkpoint.

    Args:
        iteration: current training iteration; stored in the client state and
            used (stringified) as the checkpoint tag.
        model: DeepSpeed engine exposing ``save_checkpoint(save_dir, tag,
            client_state=...)``.
        args: namespace providing ``save`` (output directory) and
            ``no_save_rng`` (skip RNG state capture when True).
        lr_scheduler: optional LR scheduler whose ``state_dict()`` is saved
            alongside the model. Kept optional (default ``None``) so existing
            three-argument callers keep working; added for consistency with
            the scheduler-aware variant of this function.
    """
    sd = {}
    sd['iteration'] = iteration
    if lr_scheduler is not None:
        sd['client_lr_scheduler'] = lr_scheduler.state_dict()
    # rng states — capture every stream so training resumes bit-for-bit.
    if not args.no_save_rng:
        sd['random_rng_state'] = random.getstate()
        sd['np_rng_state'] = np.random.get_state()
        sd['torch_rng_state'] = torch.get_rng_state()
        sd['cuda_rng_state'] = torch.cuda.get_rng_state()
        sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states()
    model.save_checkpoint(args.save, str(iteration), client_state=sd)
def save_ds_checkpoint(iteration, model, lr_scheduler, args):
    """Save a model checkpoint.

    Stores the current iteration, the LR-scheduler state (when one is given)
    and — unless ``args.no_save_rng`` is set — every RNG stream, then delegates
    the actual write to the engine's ``save_checkpoint``.
    """
    client_state = {'iteration': iteration}
    if lr_scheduler is not None:
        client_state['client_lr_scheduler'] = lr_scheduler.state_dict()
    # rng states.
    if not args.no_save_rng:
        client_state.update(
            random_rng_state=random.getstate(),
            np_rng_state=np.random.get_state(),
            torch_rng_state=torch.get_rng_state(),
            cuda_rng_state=torch.cuda.get_rng_state(),
            rng_tracker_states=mpu.get_cuda_rng_tracker().get_states(),
        )
    model.save_checkpoint(args.save, iteration, client_state=client_state)
def train_teacher_model(model,
                        labeled_dataset,
                        optimizer,
                        scheduler=None,
                        train_ratio=0.7,
                        batch_size=4,
                        device='cpu',
                        max_epochs=100,
                        print_freq=10,
                        save_path=None,
                        checkpoint=None):
    """Train ``model`` on ``labeled_dataset``, checkpointing the best model.

    The dataset is split into train/validation by ``train_ratio``. After each
    epoch the model is evaluated; whenever the validation loss improves and
    ``save_path`` is set, a checkpoint is written and the best loss updated.

    Args:
        model: network to train (moved to ``device``).
        labeled_dataset: full labeled dataset to split.
        optimizer: optimizer for the training loop.
        scheduler: optional LR scheduler, stepped once per epoch.
        train_ratio: fraction of the dataset used for training.
        batch_size: batch size for both loaders.
        device: compute device string.
        max_epochs: total number of epochs.
        print_freq: logging frequency forwarded to the epoch helpers.
        save_path: where to save best-so-far checkpoints (no saving if None).
        checkpoint: optional checkpoint path to resume from.
    """
    model.to(device)
    metric_logger = utils.MetricLogger(delimiter=" ")
    last_loss = 1e9  # best (lowest) validation loss seen so far
    cur_epoch = 0
    if checkpoint is not None:
        print("loading checkpoint:" + checkpoint)
        model, optimizer, scheduler, cur_epoch = load_checkpoint(
            model, optimizer, scheduler, device, checkpoint)
    train_dataset, vld_dataset = split_dataset(labeled_dataset, train_ratio)
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)
    vld_loader = DataLoader(vld_dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            collate_fn=collate_fn)
    for epoch in range(cur_epoch, max_epochs):
        print("epoch {} / {}".format(epoch + 1, max_epochs))
        train_one_epoch(model, optimizer, train_loader, device, epoch,
                        print_freq)
        loss = evaluate(model, vld_loader, device, epoch, print_freq)
        # FIX: compare against None with `is not None` (identity), not `!= None`.
        if loss < last_loss and save_path is not None:
            save_checkpoint(model, optimizer, scheduler, epoch + 1, device,
                            save_path)
            last_loss = loss
        if scheduler is not None:
            scheduler.step()
# Top-level training script: build the super-resolution model and run the
# epoch/batch training loop, checkpointing along the way.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("===> Building model")
model = RRDBNet()  #.to(device, dtype=torch.float)
# NOTE(review): device_ids is hard-coded to 3 GPUs; this fails on machines
# with fewer devices — confirm the target hardware.
model = nn.DataParallel(model, device_ids=[0, 1, 2])
model.to(device)
criterion = nn.MSELoss()
for epoch in range(start_epoch, nEpochs + 1):
    # NOTE(review): the optimizer is re-created every epoch, discarding Adam's
    # running moment estimates; creating it once before the loop is the usual
    # pattern — confirm this reset is intentional.
    optimizer = optim.Adam(model.parameters(),
                           lr=initial_lr,
                           weight_decay=1e-5)
    lr = adjust_learning_rate(initial_lr, optimizer, epoch)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    model.train()
    for iteration, batch in enumerate(training_data_loader, 1):
        # `Variable` is a deprecated no-op wrapper in modern PyTorch; tensors
        # track gradients directly.
        x_data, z_data = Variable(batch[0].float()).cuda(), Variable(
            batch[1].float()).cuda()
        output = model(z_data)  # z_data: degraded input, x_data: target
        loss = criterion(output, x_data)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if iteration % 100 == 0:
            print("===> Epoch[{}]({}/{}): Loss: {:.10f}".format(
                epoch, iteration, len(training_data_loader), loss.item()))
    # Checkpoint once per epoch...
    save_checkpoint(model, epoch, 'simple')
# ...and once more after training ends (duplicate of the last epoch's save —
# placement inferred from the collapsed source; confirm intent).
save_checkpoint(model, epoch, 'simple')
def Main():
    """Command-line entry point: parse training options, build a classifier on
    a pre-trained backbone, train it and save a checkpoint.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "data_directory",
        help=
        "The parent directory to containing subfolders of train and test data.",
        type=str,
        default='flowers')
    parser.add_argument("--save_dir",
                        help="The directory of checkpoint file to be saved.",
                        type=str,
                        default=None)
    parser.add_argument("--gpu",
                        help="Use GPU instead of CPU.",
                        action="store_true")
    parser.add_argument("--learning_rate",
                        help="Set learning rate for training.",
                        type=float,
                        default=0.001)
    parser.add_argument("--hidden_units", nargs="+", type=int, default=[1000],
                        help="Set a list of hidden units: e.g. say two hidden layers of 500 and 200 units, the input format: 500 [space] 200")
    parser.add_argument("-e",
                        "--epochs",
                        help="Set the number of training iterations.",
                        type=int,
                        default=10)
    parser.add_argument(
        "--skip_accuracy",
        help=
        "Skip the validation on training and testing set in each iteration, and reduce the training time.",
        action="store_true")
    parser.add_argument(
        "--arch",
        help="Pre-trained Model Options: 0: Densenet121, 1: VGG16, 2: AlexNet ",
        type=int,
        default=1)
    args = parser.parse_args()
    # NOTE(review): 'cuda' is chosen purely from the flag, without checking
    # torch.cuda.is_available() — confirm downstream code tolerates this.
    if args.gpu:
        device = 'cuda'
        print('Compute using GPU')
    else:
        device = 'cpu'
        print('Compute using CPU')
    data_dir = args.data_directory  #'flowers'
    train_dir = data_dir + '/train'
    valid_dir = data_dir + '/valid'
    test_dir = data_dir + '/test'
    file_path = args.save_dir
    print('Training data directory: ', train_dir)
    print('Testing data directory: ', test_dir)
    print('Output checkpoint file directory: ', file_path)
    train_image_dataset, trainloader = loadData(train_dir,
                                                train=True,
                                                batch_size=128,
                                                shuffle=True)
    test_image_dataset, testloader = loadData(test_dir,
                                              train=False,
                                              batch_size=128,
                                              shuffle=True)
    train_size = len(trainloader.dataset.imgs)
    print('Total number of samples in the train set: ', train_size)
    hidden_layer = args.hidden_units
    model_name = args.arch  # integer index into model_options
    print(
        'Building the model with hidden layer: {}, using pre-trained model: {}'
        .format(hidden_layer, model_options[model_name]))
    model = build_model(model_name=model_name, hidden_layer=hidden_layer)
    epochs = args.epochs
    print('Number of iteration: ', epochs)
    learn_rate = args.learning_rate
    print('Using the learning rate: ', learn_rate)
    if args.skip_accuracy:
        print_accuracy = False
        print('Skip calculating the accuracy during the training.')
    else:
        print_accuracy = True
        print(
            'Will calculate the accuracy during the training. Expected longer training time.'
        )
    criterion = nn.NLLLoss()  # classifier outputs log-probabilities
    # Only the classifier head is optimized; backbone weights stay frozen.
    optimizer = optim.Adam(model.classifier.parameters(), lr=learn_rate)
    print('Start training...')
    train(model,
          trainloader,
          testloader,
          criterion,
          optimizer,
          epochs=epochs,
          print_every=10,
          print_accuracy=print_accuracy,
          device=device)
    accuracy_score(model, trainloader, device=device, print_score=True)
    save_checkpoint(model,
                    optimizer,
                    train_image_dataset,
                    epochs,
                    file_path=file_path,
                    file_name='checkpoint.pth',
                    print_model=False)
step + (epoch - 1) * iters_per_epoch) add_summary_value(summary_w, 'lr', lr, step + (epoch - 1) * iters_per_epoch) loss_temp = 0 start = time.time() if args.mGPUs: save_name = os.path.join( './data/results', args.train_id, args.root_model, 'faster_rcnn_{}_{}.pth'.format(epoch, step)) save_checkpoint( { 'train_id': args.train_id, 'epoch': epoch + 1, 'model': fasterRCNN.module.state_dict(), 'optimizer': optimizer.state_dict(), 'pooling_mode': args.POOLING_MODE, 'class_agnostic': args.class_agnostic, }, save_name) else: save_name = os.path.join( './data/results', args.train_id, args.root_model, 'faster_rcnn_{}_{}.pth'.format(epoch, step)) save_checkpoint( { 'train_id': args.train_id, 'epoch': epoch + 1, 'model': fasterRCNN.state_dict(), 'optimizer': optimizer.state_dict(), 'pooling_mode': args.POOLING_MODE,
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    scheduler: torch.optim.lr_scheduler,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    accumulation_steps: int,
    initial_epoch: int,
    num_epoch: int,
    max_acc: float,
    hide_map_prob: float,
    dropout_images_prob: List[float],
    num_load_files_training: int,
    fp16: bool = True,
    amp_opt_level=None,
    save_checkpoints: bool = True,
    eval_every: int = 5,
    save_every: int = 20,
    save_best: bool = True,
):
    """
    Train a model
    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been restored from checkpoint)
    - num_epochs: Number of epochs to do
    - max_acc: Accuracy in the development set (0 unless the model has been restored from checkpoint)
    - hide_map_prob: Probability for removing the minimap (put a black square) from a training example (0<=hide_map_prob<=1)
    - dropout_images_prob List of 5 floats or None, probability for removing each input image during training (black image) from a training example (0<=dropout_images_prob<=1)
    - fp16: Use FP16 for training
    - amp_opt_level: If FP16 training Nvidia apex opt level
    - save_checkpoints: save a checkpoint each epoch (Each checkpoint will rewrite the previous one)
    - save_best: save the model that achieves the higher accuracy in the development set
    Output:
    - float: Accuracy in the development test of the best model
    """
    writer: SummaryWriter = SummaryWriter()
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss()
    print("Loading dev set")
    X_dev, y_dev = load_dataset(dev_dir, fp=16 if fp16 else 32)
    X_dev = torch.from_numpy(X_dev)
    print("Loading test set")
    X_test, y_test = load_dataset(test_dir, fp=16 if fp16 else 32)
    X_test = torch.from_numpy(X_test)
    total_training_exampels: int = 0
    model.zero_grad()
    # NOTE(review): shuffle=False on the training loader — confirm whether the
    # PickleDataset is pre-shuffled; otherwise training sees a fixed order.
    trainLoader = DataLoader(dataset=PickleDataset(train_dir),
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=8)
    printTrace("Training...")
    iteration_no: int = 0
    for epoch in range(num_epoch):
        #step_no: int = 0
        #num_used_files: int = 0
        print('EpochNum: ' + str(epoch))
        model.train()
        start_time: float = time.time()
        running_loss: float = 0.0
        acc_dev: float = 0.0
        for num_batchs, inputs in enumerate(trainLoader):
            # Each sample packs 5 stacked frames; flatten to (B*5, 3, H, W).
            X_bacth = torch.reshape(inputs[0],
                                    (inputs[0].shape[0] * 5, 3,
                                     inputs[0].shape[2],
                                     inputs[0].shape[3])).to(device)
            y_batch = torch.reshape(inputs[1],
                                    (inputs[0].shape[0],)).long().to(device)
            #print(X_bacth)
            #X_bacth, y_batch = (
            #    torch.from_numpy(batch_data).to(device),
            #    torch.from_numpy(inputs[1]).long().to(device),
            #)
            outputs = model.forward(X_bacth)
            #print(outputs.size())
            #print(y_batch)
            # Scale the loss so gradients average over accumulation_steps.
            loss = criterion(outputs, y_batch) / accumulation_steps
            running_loss += loss.item()
            if fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            if fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               1.0)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # NOTE(review): optimizer.step() runs every batch here, so
            # accumulation_steps only rescales the loss — confirm intent.
            optimizer.step()
            model.zero_grad()
        #scheduler.step(running_loss)
        # Print Statistics
        # NOTE(review): num_batchs is the last enumerate index (count - 1), so
        # this divisor is off by one and raises ZeroDivisionError for a
        # single-batch epoch — confirm.
        printTrace(
            f"Loss: {running_loss/num_batchs}. "
            f"Learning rate {optimizer.state_dict()['param_groups'][0]['lr']}"
        )
        writer.add_scalar("Loss/train", running_loss, iteration_no)
        if (iteration_no + 1) % eval_every == 0:
            start_time_eval: float = time.time()
            acc_dev: float = evaluate(
                model=model,
                X=X_dev,
                golds=y_dev,
                device=device,
                batch_size=batch_size,
            )
            acc_test: float = evaluate(
                model=model,
                X=X_test,
                golds=y_test,
                device=device,
                batch_size=batch_size,
            )
            printTrace(
                f"Acc dev set: {round(acc_dev,2)}. "
                f"Acc test set: {round(acc_test,2)}. "
                f"Eval time: {round(time.time() - start_time_eval,2)} secs."
            )
            # Chained comparison: acc_dev > 0.0 AND acc_dev > max_acc.
            if 0.0 < acc_dev > max_acc and save_best:
                max_acc = acc_dev
                printTrace(
                    f"New max acc in dev set {round(max_acc,2)}. Saving model..."
                )
                save_model(
                    model=model,
                    save_dir=output_dir,
                    fp16=fp16,
                    amp_opt_level=amp_opt_level,
                )
            writer.add_scalar("Accuracy/dev", acc_dev, iteration_no)
            writer.add_scalar("Accuracy/test", acc_test, iteration_no)
        if save_checkpoints and (iteration_no + 1) % save_every == 0:
            printTrace("Saving checkpoint...")
            save_checkpoint(
                path=os.path.join(output_dir, "checkpoint.pt"),
                model=model,
                optimizer_name=optimizer_name,
                optimizer=optimizer,
                scheduler=scheduler,
                acc_dev=acc_dev,
                epoch=initial_epoch + epoch,
                fp16=fp16,
                opt_level=amp_opt_level,
            )
        iteration_no += 1
    return max_acc
# Tail of a training-epoch loop from a larger script: log validation metrics,
# checkpoint the model, and stop once the iteration budget is exhausted.
# NOTE(review): the `break` below belongs to an enclosing loop that lies
# outside this chunk; indentation here is reconstructed.
add_summary_value(summary_w, 'eval_loss', loss_tt / len(pd_val.roidb),
                  total_iters)
add_summary_value(summary_w, 'mcls_sc', mcls_sc, total_iters)
add_summary_value(summary_w, 'mcls_ac', mcls_ac, total_iters)
add_summary_value(summary_w, 'mcls_ap', mcls_ap, total_iters)
add_summary_value(summary_w, 'mins_sc', mins_sc, total_iters)
add_summary_value(summary_w, 'mins_ac', mins_ac, total_iters)
add_summary_value(summary_w, 'mins_ap', mins_ap, total_iters)
save_name = os.path.join(
    './data/results', args.train_id, args.root_model,
    'checkpoint{}_{}.pth'.format(epoch, total_iters))
save_checkpoint(
    {
        'train_id': args.train_id,
        'epoch': epoch + 1,  # epoch to resume FROM, not the one just finished
        'model': basenet.state_dict(),
        'optimizer': optimizer.state_dict(),
    }, save_name)
print('save model: {}'.format(save_name))
end = time.time()
print(end - start)
if total_iters > args.max_iters:
    break
# On resume, subtract the iterations already counted for completed epochs.
if args.resume:
    total_iters -= (args.start_epoch - 1) * iters_per_epoch
print('total train time: %.2f s, %.2f h' % (total_time, total_time / 3600.))
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    scheduler: torch.optim.lr_scheduler,
    scaler: GradScaler,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    accumulation_steps: int,
    initial_epoch: int,
    num_epoch: int,
    running_loss: float,
    total_batches: int,
    total_training_examples: int,
    max_acc: float,
    hide_map_prob: float,
    dropout_images_prob: List[float],
    fp16: bool = True,
    save_checkpoints: bool = True,
    save_every: int = 20,
    save_best: bool = True,
):
    """
    Train a model
    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been restored from checkpoint)
    - num_epochs: Number of epochs to do
    - max_acc: Accuracy in the development set (0 unless the model has been restored from checkpoint)
    - hide_map_prob: Probability for removing the minimap (put a black square) from a training example (0<=hide_map_prob<=1)
    - dropout_images_prob List of 5 floats or None, probability for removing each input image during training (black image) from a training example (0<=dropout_images_prob<=1)
    - fp16: Use FP16 for training
    - save_checkpoints: save a checkpoint each epoch (Each checkpoint will rewrite the previous one)
    - save_best: save the model that achieves the higher accuracy in the development set
    Output:
    - float: Accuracy in the development test of the best model
    """
    if not os.path.exists(output_dir):
        print(f"{output_dir} does not exits. We will create it.")
        os.makedirs(output_dir)
    writer: SummaryWriter = SummaryWriter()
    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss().to(device)
    model.zero_grad()
    print_message("Training...")
    for epoch in range(num_epoch):
        acc_dev: float = 0.0
        num_batches: int = 0
        step_no: int = 0
        data_loader_train = DataLoader(
            Tedd1104Dataset(
                dataset_dir=train_dir,
                hide_map_prob=hide_map_prob,
                dropout_images_prob=dropout_images_prob,
            ),
            batch_size=batch_size,
            shuffle=True,
            num_workers=os.cpu_count(),
            pin_memory=True,
        )
        start_time: float = time.time()
        step_start_time: float = time.time()
        dataloader_delay: float = 0
        model.train()
        for batch in data_loader_train:
            # Interleave the 5 per-sample frames into a (B*5, C, H, W) batch.
            x = torch.flatten(
                torch.stack(
                    (
                        batch["image1"],
                        batch["image2"],
                        batch["image3"],
                        batch["image4"],
                        batch["image5"],
                    ),
                    dim=1,
                ),
                start_dim=0,
                end_dim=1,
            ).to(device)
            y = batch["y"].to(device)
            # Time spent waiting on the DataLoader since the last step.
            dataloader_delay += time.time() - step_start_time
            total_training_examples += len(y)
            if fp16:
                # Mixed-precision forward pass; loss scaled for accumulation.
                with autocast():
                    outputs = model.forward(x)
                    loss = criterion(outputs, y)
                    loss = loss / accumulation_steps
                running_loss += loss.item()
                scaler.scale(loss).backward()
            else:
                outputs = model.forward(x)
                loss = criterion(outputs, y) / accumulation_steps
                running_loss += loss.item()
                loss.backward()
            if ((step_no + 1) % accumulation_steps == 0) or (
                step_no + 1 >= len(data_loader_train)
            ):  # If we are in the last bach of the epoch we also want to perform gradient descent
                if fp16:
                    # Gradient clipping (unscale first so the norm is real).
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                else:
                    # Gradient clipping
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    optimizer.zero_grad()
                total_batches += 1
                num_batches += 1
                # Plateau-style scheduler driven by the running mean loss.
                scheduler.step(running_loss / total_batches)
                batch_time = round(time.time() - start_time, 2)
                # Remaining optimizer steps this epoch * time per step.
                est: float = batch_time * (math.ceil(
                    len(data_loader_train) / accumulation_steps) - num_batches)
                print_message(
                    f"EPOCH: {initial_epoch + epoch}. "
                    f"{num_batches} of {math.ceil(len(data_loader_train)/accumulation_steps)} batches. "
                    f"Total examples used for training {total_training_examples}. "
                    f"Iteration time: {batch_time} secs. "
                    f"Data Loading bottleneck: {round(dataloader_delay, 2)} secs. "
                    f"Epoch estimated time: "
                    f"{str(datetime.timedelta(seconds=est)).split('.')[0]}")
                print_message(
                    f"Loss: {running_loss / total_batches}. "
                    f"Learning rate {optimizer.state_dict()['param_groups'][0]['lr']}"
                )
                writer.add_scalar("Loss/train", running_loss / total_batches,
                                  total_batches)
                if save_checkpoints and (total_batches + 1) % save_every == 0:
                    print_message("Saving checkpoint...")
                    save_checkpoint(
                        path=os.path.join(output_dir, "checkpoint.pt"),
                        model=model,
                        optimizer_name=optimizer_name,
                        optimizer=optimizer,
                        scheduler=scheduler,
                        running_loss=running_loss,
                        total_batches=total_batches,
                        total_training_examples=total_training_examples,
                        acc_dev=max_acc,
                        epoch=initial_epoch + epoch,
                        fp16=fp16,
                        scaler=None if not fp16 else scaler,
                    )
                dataloader_delay: float = 0
                start_time: float = time.time()
            step_no += 1
            step_start_time = time.time()
        del data_loader_train
        print_message("Dev set evaluation...")
        start_time_eval: float = time.time()
        data_loader_dev = DataLoader(
            Tedd1104Dataset(
                dataset_dir=dev_dir,
                hide_map_prob=0,
                dropout_images_prob=[0, 0, 0, 0, 0],
            ),
            batch_size=batch_size // 2,  # Use smaller batch size to prevent OOM issues
            shuffle=False,
            num_workers=os.cpu_count() // 2,  # Use less cores to save RAM
            pin_memory=True,
        )
        acc_dev: float = evaluate(
            model=model,
            data_loader=data_loader_dev,
            device=device,
            fp16=fp16,
        )
        del data_loader_dev
        print_message("Test set evaluation...")
        data_loader_test = DataLoader(
            Tedd1104Dataset(
                dataset_dir=test_dir,
                hide_map_prob=0,
                dropout_images_prob=[0, 0, 0, 0, 0],
            ),
            batch_size=batch_size // 2,  # Use smaller batch size to prevent OOM issues
            shuffle=False,
            num_workers=os.cpu_count() // 2,  # Use less cores to save RAM
            pin_memory=True,
        )
        acc_test: float = evaluate(
            model=model,
            data_loader=data_loader_test,
            device=device,
            fp16=fp16,
        )
        del data_loader_test
        print_message(
            f"Acc dev set: {round(acc_dev*100,2)}. "
            f"Acc test set: {round(acc_test*100,2)}. "
            f"Eval time: {round(time.time() - start_time_eval,2)} secs.")
        # Chained comparison: acc_dev > 0.0 AND acc_dev > max_acc.
        if 0.0 < acc_dev > max_acc and save_best:
            max_acc = acc_dev
            print_message(
                f"New max acc in dev set {round(max_acc, 2)}. Saving model...")
            save_model(
                model=model,
                save_dir=output_dir,
                fp16=fp16,
            )
        writer.add_scalar("Accuracy/dev", acc_dev, epoch)
        writer.add_scalar("Accuracy/test", acc_test, epoch)
    return max_acc
# Tail of an evaluation loop from a larger script: count correct predictions
# for the current batch and report running accuracy. Indentation of this
# fragment is reconstructed; the enclosing loop header is outside this chunk.
pred_choice = outputs.data.max(1)[1]  # index of the max logit per sample
correct += pred_choice.eq(labels.data).cpu().sum()
sum += len(labels)  # NOTE(review): `sum` shadows the builtin — running sample count
print('batch_index: [%d/%d]' % (batch_index, len(evalloader)),
      'Eval epoch: [%d]' % (epoch),
      'correct/sum:%d/%d, %.4f' % (correct, sum, correct / sum))


if __name__ == '__main__':
    # Whether to resume model parameters from a saved checkpoint
    load = False
    if load:
        checkpoint = model.load_checkpoint()
        net.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch'] + 1
    else:
        start_epoch = 0
    # Set up the optimizer
    optimizer = optim.Adam(net.parameters(),
                           lr=1e-3,
                           betas=(0.9, 0.999),
                           weight_decay=0)
    # optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=1e-1, weight_decay=1e-4)
    for epoch in range(start_epoch, n_epoch):
        train(epoch)
        # Save parameters after every epoch
        checkpoint = {
            'epoch': epoch,
            'state_dict': net.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        model.save_checkpoint(checkpoint)
        eval(epoch)  # NOTE(review): project eval routine; shadows builtin eval
def train(
    model: DRIVEMODEL,
    optimizer_name: str,
    optimizer: torch.optim,
    scheduler: torch.optim.lr_scheduler,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    accumulation_steps: int,
    initial_epoch: int,
    num_epoch: int,
    max_acc: float,
    hide_map_prob: float,
    dropout_images_prob: List[float],
    num_load_files_training: int,
    fp16: bool = True,
    amp_opt_level=None,
    save_checkpoints: bool = True,
    eval_every: int = 5,
    save_every: int = 20,
    save_best: bool = True,
):
    """
    Train a model
    Input:
    - model: DRIVEMODEL model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been restored from checkpoint)
    - num_epochs: Number of epochs to do
    - max_acc: Accuracy in the development set (0 unless the model has been restored from checkpoint)
    - hide_map_prob: Probability for removing the minimap (put a black square) from a training example (0<=hide_map_prob<=1)
    - dropout_images_prob List of 5 floats or None, probability for removing each input image during training (black image) from a training example (0<=dropout_images_prob<=1)
    - fp16: Use FP16 for training
    - amp_opt_level: If FP16 training Nvidia apex opt level
    - save_checkpoints: save a checkpoint each epoch (Each checkpoint will rewrite the previous one)
    - save_best: save the model that achieves the higher accuracy in the development set
    Output:
    - float: Accuracy in the development test of the best model
    """
    writer: SummaryWriter = SummaryWriter()
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss()
    print("Loading dev set")
    X_dev, y_dev = load_dataset(dev_dir, fp=16 if fp16 else 32)
    X_dev = torch.from_numpy(X_dev)
    print("Loading test set")
    X_test, y_test = load_dataset(test_dir, fp=16 if fp16 else 32)
    X_test = torch.from_numpy(X_test)
    total_training_exampels: int = 0
    model.zero_grad()
    printTrace("Training...")
    for epoch in range(num_epoch):
        step_no: int = 0
        iteration_no: int = 0
        num_used_files: int = 0
        data_loader = DataLoader_AutoDrive(
            dataset_dir=train_dir,
            nfiles2load=num_load_files_training,
            hide_map_prob=hide_map_prob,
            dropout_images_prob=dropout_images_prob,
            fp=16 if fp16 else 32,
        )
        data = data_loader.get_next()
        # Get files in batches, all files will be loaded and data will be shuffled
        while data:
            X, y = data
            model.train()
            start_time: float = time.time()
            total_training_exampels += len(y)
            running_loss: float = 0.0
            num_batchs: int = 0
            acc_dev: float = 0.0
            for X_bacth, y_batch in nn_batchs(X, y, batch_size):
                X_bacth, y_batch = (
                    torch.from_numpy(X_bacth).to(device),
                    torch.from_numpy(y_batch).long().to(device),
                )
                outputs = model.forward(X_bacth)
                # Scale so accumulated gradients average over the steps.
                loss = criterion(outputs, y_batch) / accumulation_steps
                running_loss += loss.item()
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                if fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), 1.0)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                # FIX: the original condition was `(step_no + 1) %
                # accumulation_steps or (...)`, which is truthy on every step
                # EXCEPT accumulation boundaries — the accumulation logic was
                # inverted. Step the optimizer exactly every
                # `accumulation_steps` steps (or on the final batch of the
                # epoch, as the original comment intends).
                if (step_no + 1) % accumulation_steps == 0 or (
                        num_used_files + 1 >
                        len(data_loader) - num_load_files_training
                        and num_batchs == math.ceil(len(y) / batch_size) - 1
                ):  # If we are in the last bach of the epoch we also want to perform gradient descent
                    optimizer.step()
                    model.zero_grad()
                num_batchs += 1
                step_no += 1
            num_used_files += num_load_files_training
            # Print Statistics
            printTrace(
                f"EPOCH: {initial_epoch+epoch}. Iteration {iteration_no}. "
                f"{num_used_files} of {len(data_loader)} files. "
                f"Total examples used for training {total_training_exampels}. "
                f"Iteration time: {round(time.time() - start_time,2)} secs.")
            printTrace(
                f"Loss: {-1 if num_batchs == 0 else running_loss / num_batchs}. "
                f"Learning rate {optimizer.state_dict()['param_groups'][0]['lr']}"
            )
            # FIX: guard the divisor — the log line above already handles
            # num_batchs == 0, but the scalar logging and scheduler step
            # previously raised ZeroDivisionError on an empty chunk.
            if num_batchs > 0:
                writer.add_scalar("Loss/train", running_loss / num_batchs,
                                  iteration_no)
                scheduler.step(running_loss / num_batchs)
            if (iteration_no + 1) % eval_every == 0:
                start_time_eval: float = time.time()
                if len(X) > 0 and len(y) > 0:
                    acc_train: float = evaluate(
                        model=model,
                        X=torch.from_numpy(X),
                        golds=y,
                        device=device,
                        batch_size=batch_size,
                    )
                else:
                    acc_train = -1.0
                acc_dev: float = evaluate(
                    model=model,
                    X=X_dev,
                    golds=y_dev,
                    device=device,
                    batch_size=batch_size,
                )
                acc_test: float = evaluate(
                    model=model,
                    X=X_test,
                    golds=y_test,
                    device=device,
                    batch_size=batch_size,
                )
                printTrace(
                    f"Acc training set: {round(acc_train,2)}. "
                    f"Acc dev set: {round(acc_dev,2)}. "
                    f"Acc test set: {round(acc_test,2)}. "
                    f"Eval time: {round(time.time() - start_time_eval,2)} secs."
                )
                # Save only genuine improvements (acc_dev must be positive and
                # beat the best seen so far).
                if acc_dev > 0.0 and acc_dev > max_acc and save_best:
                    max_acc = acc_dev
                    printTrace(
                        f"New max acc in dev set {round(max_acc,2)}. Saving model..."
                    )
                    save_model(
                        model=model,
                        save_dir=output_dir,
                        fp16=fp16,
                        amp_opt_level=amp_opt_level,
                    )
                if acc_train > -1:
                    writer.add_scalar("Accuracy/train", acc_train,
                                      iteration_no)
                writer.add_scalar("Accuracy/dev", acc_dev, iteration_no)
                writer.add_scalar("Accuracy/test", acc_test, iteration_no)
            if save_checkpoints and (iteration_no + 1) % save_every == 0:
                printTrace("Saving checkpoint...")
                save_checkpoint(
                    path=os.path.join(output_dir, "checkpoint.pt"),
                    model=model,
                    optimizer_name=optimizer_name,
                    optimizer=optimizer,
                    scheduler=scheduler,
                    acc_dev=acc_dev,
                    epoch=initial_epoch + epoch,
                    fp16=fp16,
                    opt_level=amp_opt_level,
                )
            iteration_no += 1
            data = data_loader.get_next()
        data_loader.close()
    return max_acc
# Resume-and-train loop for the down-sampling model: restore state from an
# already-loaded checkpoint (if any), then train while logging per-batch and
# running-mean losses and checkpointing after every epoch.
if ckpt is not None:
    print('recovering from checkpoints...')
    model.load_state_dict(ckpt['model'])
    begin_epoch = ckpt['epoch'] + 1  # resume from the epoch after the saved one
    print('resuming training')
# NOTE(review): when ckpt is None, `begin_epoch` must have been initialized
# earlier in the file (outside this chunk) — confirm.
begin = time()
with open(os.path.join('../logs', 'down_sample.log'), 'w') as f:
    for epoch in range(begin_epoch, 1000):
        epoch_loss = []
        for bid, batch in enumerate(loader):
            hr, lr = batch['hr'].to(DEVICE), batch['lr'].to(DEVICE)
            optimizer.zero_grad()
            ds = model(hr)  # model maps high-res input to a down-sampled output
            batch_loss = loss(ds, lr)
            batch_loss.backward()
            optimizer.step()
            epoch_loss.append(batch_loss.cpu().detach().numpy())
            # BMSE = batch loss, EMSE = running epoch mean, RT = elapsed time.
            print(
                'Epoch {} | Batch {} | BMSE {:6f} | EMSE {:.6f} | RT {:6f}'
                .format(epoch, bid, batch_loss, np.mean(epoch_loss),
                        since(begin)))
            f.write('{},{},{},{},{}\n'.format(epoch, bid, batch_loss,
                                              np.mean(epoch_loss),
                                              since(begin)))
            f.flush()
        # Checkpoint at the end of every epoch (same name each time).
        state_dict = {'model': model.state_dict(), 'epoch': epoch}
        save_checkpoint(state_dict, '../checkpoints/',
                        model_name='down_sample')
def main():
    """Parse command-line options, train a transfer-learning classifier and
    optionally save it to a checkpoint directory.
    """
    parser = argparse.ArgumentParser(
        description=
        'Train a new network on a dataset and save the model as a checkpoint')
    parser.add_argument('data_dir',
                        metavar='path/to/dataset',
                        type=str,
                        nargs=1,
                        help='path to a data directory')
    parser.add_argument(
        '--save_dir',
        metavar='path/to/save_dir',
        type=str,
        nargs='?',
        help='path to a directory in which to save a checkpoint')
    parser.add_argument('--learning_rate',
                        metavar='learning rate',
                        type=float,
                        nargs='?',
                        default=0.002,
                        help='learning rate value for model training')
    parser.add_argument('--hidden_units',
                        metavar='hidden units',
                        type=int,
                        nargs='?',
                        default=512,
                        help='number of hidden units for model classifier')
    parser.add_argument('--epochs',
                        metavar='epochs',
                        type=int,
                        nargs='?',
                        default=5,
                        help='number of epochs for model training')
    parser.add_argument(
        '--arch',
        metavar='model name',
        type=str,
        nargs='?',
        default='densenet161',
        help='name of transfer model (e.g., resnet18 or densenet161)')
    parser.add_argument('--gpu',
                        action='store_true',
                        help='use GPU for model training (recommended)')
    args = parser.parse_args()
    data_dir = args.data_dir[0]  # nargs=1 yields a one-element list
    save_dir = args.save_dir
    learning_rate = args.learning_rate
    hidden_units = args.hidden_units
    epochs = args.epochs
    model_name = args.arch
    use_gpu = args.gpu
    print('Using the following hyperparameters for training')
    print(f' Learning rate: {learning_rate}')
    print(f' Hidden units: {hidden_units}')
    print(f' Epochs: {epochs}')
    print(f'Transfer model name: {model_name}')
    # Fail fast if the user asked for a GPU that is not present.
    if use_gpu and not cuda.is_available():
        print('Error: GPU not available. Try again without the --gpu flag')
        exit(1)
    if use_gpu:
        print('Training on GPU...')
    else:
        print('Training on CPU...')
        print(
            'Warning: training on CPU could take a LONG time. Consider using --gpu flag.'
        )
    print('')
    # Reject unsupported backbones before doing any heavy work.
    if model_name not in ALLOWED_ARCHS:
        print(
            f'Error: Model architecture {model_name} is not currently supported.'
        )
        print('Please try one of the following:')
        for a in ALLOWED_ARCHS:
            print(f' {a}')
        exit(1)
    dataloaders, image_datasets = prep_data(data_dir)
    model = build_model_from_pretrained(model_name, hidden_units,
                                        image_datasets['test'].class_to_idx)
    # print(model.classifier)
    # active_session keeps the workspace alive during long training runs.
    with active_session():
        trained, optimizer = train(model, dataloaders, learning_rate, epochs,
                                   use_gpu)
    # Only persist a checkpoint when the user supplied --save_dir.
    if save_dir:
        save_checkpoint(trained, epochs, optimizer, model_name, learning_rate,
                        save_dir)
from utils import load_data


def _build_arg_parser():
    """Return the command-line parser for this training script."""
    parser = argparse.ArgumentParser()
    parser.add_argument('data_dir', type=str)
    parser.add_argument('--save_dir', type=str, default='./')
    parser.add_argument('--arch', type=str, default='vgg16')
    # FIX: the default was the string '0.01'; argparse did convert it via
    # type=float, but a float literal matches the declared type directly.
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--hidden_units', type=int, default=512)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--gpu', action='store_true')
    return parser


if __name__ == '__main__':
    # FIX: parse_args() previously ran at import time, so importing this
    # module without a data_dir argument raised SystemExit. Parsing inside
    # the __main__ guard keeps the module importable; script behavior is
    # unchanged.
    arg = _build_arg_parser().parse_args()
    print('Loading data')
    trainloader, validloader, testloader, class_to_idx = load_data(
        arg.data_dir)
    print('Creating model')
    model = create_model(arg.arch, arg.hidden_units)
    print('Training model')
    train_model(model, arg.epochs, arg.lr, arg.gpu, trainloader, validloader)
    print('Saving model')
    model.class_to_idx = class_to_idx
    save_checkpoint(model, arg.save_dir + arg.arch + '.pth', arg.arch,
                    arg.hidden_units, class_to_idx)
def main():
    """Configure and run one training session from command-line arguments.

    Loads the category map from 'cat_to_name.json', builds a pre-trained
    model with a custom classifier, trains it, optionally evaluates on the
    test split, and saves a checkpoint.
    """
    # Fetch user arguments
    user_input = get_train_arguments()
    data_dir = user_input.data_directory
    checkpoint_save_dir = user_input.save_dir
    architecture = user_input.arch
    classifier_hidden_units_list = user_input.hidden_units
    dropout = 0.2  # fixed dropout probability for the new classifier
    epochs = user_input.epochs
    learn_rate = user_input.learning_rate
    print_every = user_input.print_every
    device = torch.device(m.determine_device(user_input.gpu))
    # Classifier output size = number of categories in the JSON map.
    with open('cat_to_name.json', 'r') as f:
        cat_to_name = json.load(f)
    classifier_outputs = len(cat_to_name)
    print("Configuring training session with the following parameters:\n",
          f"- Architecture: {architecture}\n",
          f"- Hidden layer inputs: {classifier_hidden_units_list}\n",
          f"- Classifier Outputs: {classifier_outputs}\n",
          f"- Dropout: {dropout}\n", f"- Epochs: {epochs}\n",
          f"- Learn Rate: {learn_rate}\n", f"- Device: {device}\n")
    # Load training and validation data
    training_dataloader, training_dataset = load.load_data(
        data_dir, icc.DIRECTORIES[icc.TRAIN_DIR])
    validation_dataloader, validation_dataset = load.load_data(
        data_dir, icc.DIRECTORIES[icc.VALID_DIR])
    # Load pre-trained model with new classifier
    model = m.build_model(architecture, classifier_hidden_units_list,
                          classifier_outputs, dropout)
    model.to(device)
    # Create an optimizer to update the weights (classifier parameters only)
    classifier_params = m.fetch_feedforward_classifier_parameters(
        model, architecture)
    optimizer = optim.Adam(classifier_params, lr=learn_rate)
    criterion = nn.NLLLoss()
    # Train
    print(f"\nStarting training: {datetime.datetime.now()}")
    start = time.time()
    train_losses, validation_losses = train(model, device, epochs, optimizer,
                                            criterion, training_dataloader,
                                            validation_dataloader,
                                            print_every)
    print(
        f"Finished training: {datetime.datetime.now()}. Total run time: {time.time() - start}"
    )
    # Test (optional)
    if user_input.test:
        testing_dataloader, testing_dataset = load.load_data(
            data_dir, icc.DIRECTORIES[icc.TEST_DIR])
        test_start = time.time()
        test(model, criterion, testing_dataloader, device)
        print(f"Testing time: {time.time() - test_start}")
    # Save checkpoint
    m.save_checkpoint(model, training_dataset.class_to_idx, optimizer, epochs,
                      classifier_outputs, classifier_hidden_units_list,
                      dropout, architecture, checkpoint_save_dir)
def train(args):
    """Train a pneumonia-detection U-Net and evaluate a box threshold.

    Reads the labels CSV, splits patients into train/validation sets, builds
    datasets/loaders, trains via `train_and_evaluate`, then predicts on the
    validation set, selects (or takes from `args`) a detection threshold,
    reports the average precision, and saves a final checkpoint.

    Fix vs. original: the `img_batch, ...` sanity-check assignment was
    commented out while the following print still referenced `img_batch`,
    which raised a NameError; the assignment is restored.
    """
    original_image_shape = 1024  # source images are 1024x1024 pixels
    validation_frac = 0.10  # fraction of patients held out for validation
    df_train = pd.read_csv(args.labels)
    # .sample(frac=1) does the shuffling
    df_train = df_train.sample(frac=1, random_state=args.seed)
    pIds = [pId for pId in df_train["patientId"].unique()]
    pIds_valid = pIds[:int(round(validation_frac * len(pIds)))]
    pIds_train = pIds[int(round(validation_frac * len(pIds))):]
    print("{} patient IDs shuffled and {}% of them used in validation set.".
          format(len(pIds), validation_frac * 100))
    print(
        "{} images went into train set and {} images went into validation set."
        .format(len(pIds_train), len(pIds_valid)))
    # Map each positive patient (Target == 1) to their ground-truth boxes.
    pId_boxes_dict = {}
    for pId in (df_train.loc[(
            df_train["Target"] == 1)]["patientId"].unique().tolist()):
        pId_boxes_dict[pId] = get_boxes_per_patient(df_train, pId)
    print("{} ({:.1f}%) images have target boxes.".format(
        len(pId_boxes_dict), 100 * (len(pId_boxes_dict) / len(pIds))))
    transform = tv.transforms.Compose([tv.transforms.ToTensor()])
    # create datasets (light rotation/warping augmentation on train only)
    dataset_train = PneumoniaDataset(
        root=args.data,
        pIds=pIds_train,
        predict=False,
        boxes=pId_boxes_dict,
        rescale_factor=args.rescale_factor,
        transform=transform,
        rotation_angle=3,
        warping=True,
        seed=args.seed,
    )
    dataset_valid = PneumoniaDataset(
        root=args.data,
        pIds=pIds_valid,
        predict=False,
        boxes=pId_boxes_dict,
        rescale_factor=args.rescale_factor,
        transform=transform,
        rotation_angle=0,
        warping=False,
        seed=args.seed,
    )
    # define the dataloaders with the previous dataset
    loader_train = DataLoader(
        dataset=dataset_train,
        batch_size=args.batch_size,
        shuffle=True,
        pin_memory=True,
    )
    loader_valid = DataLoader(
        dataset=dataset_valid,
        batch_size=args.batch_size,
        shuffle=True,
        pin_memory=True,
    )
    # Check if train images have been properly loaded
    print("{} images in train set and {} images in validation set.".format(
        len(dataset_train), len(dataset_valid)))
    # FIX: this assignment must execute — img_batch is used on the next line.
    img_batch, target_batch, pId_batch = next(iter(loader_train))
    print("Tensor batch size:", img_batch.size())
    # Spot-check a few random training samples.
    for i in np.random.choice(len(dataset_train), size=5, replace=False):
        img, target, pId = dataset_train[i]  # picking an image with pneumonia
        print("\nImage and mask shapes:", img.shape, target.shape)
        print("Patient ID:", pId)
        print("Image scale: {} - {}".format(img[0].min(), img[0].max()))
        print("Target mask scale: {} - {}".format(target[0].min(),
                                                  target[0].max()))
    # define an instance of the model
    model = PneumoniaUNET(
        bn_momentum=args.bn_momentum,
        eps=args.bn_eps,
        alpha_leaky=args.alpha_leaky,
    ).cuda()
    print(model)
    # define the loss function
    loss_fn = BCEWithLogitsLoss2d().cuda()
    # Debug mode trims epochs/steps for a fast smoke run.
    num_epochs = 2 if args.debug else args.epochs
    num_steps_train = 50 if args.debug else len(loader_train)
    num_steps_eval = 10 if args.debug else len(loader_valid)
    shape = int(round(original_image_shape / args.rescale_factor))
    histories, best_models = train_and_evaluate(
        model,
        loader_train,
        loader_valid,
        args.learning_rate,
        args.optimizer,
        args.learning_rate_decay,
        args.momentum,
        args.eps,
        args.weight_decay,
        loss_fn,
        num_epochs,
        num_steps_train,
        num_steps_eval,
        pId_boxes_dict,
        args.rescale_factor,
        shape,
        save_path=args.save,
        restore_file=args.checkpoint,
    )
    best_model = best_models["best precision model"]
    # Rebuild the validation set in predict mode (no boxes, no augmentation).
    dataset_valid = PneumoniaDataset(
        root=args.data,
        pIds=pIds_valid[:100] if args.debug else pIds_valid,
        predict=True,
        boxes=None,
        rescale_factor=args.rescale_factor,
        transform=transform,
        seed=args.seed,
    )
    loader_valid = DataLoader(dataset=dataset_valid,
                              batch_size=args.batch_size,
                              shuffle=False)
    predictions_valid = predict(best_model, loader_valid)
    if args.train_threshold and not args.debug:
        # Search for the box threshold that maximizes average precision.
        (
            best_threshold,
            best_avg_precision_valid,
            thresholds,
            avg_precision_valids,
        ) = train_threshold(
            dataset_valid,
            predictions_valid,
            pId_boxes_dict,
            args.rescale_factor,
        )
        print(best_threshold)
        print(best_avg_precision_valid)
        print(thresholds)
        print(avg_precision_valids)
    else:
        best_threshold = args.box_threshold
    if args.save:
        os.makedirs(f"{args.save}/images", exist_ok=True)
    img_precisions = evaluate_threshold(
        dataset_valid,
        predictions_valid,
        best_threshold,
        pId_boxes_dict,
        args.rescale_factor,
        image_save_path=f"{args.save}/images" if args.save_images else None,
    )
    print(
        f"Total Average Precision: {np.nansum(img_precisions) / len(img_precisions)}"
    )
    save_checkpoint(
        {
            "best_threshold": best_threshold,
            "state_dict": best_model.state_dict(),
        },
        args.save,
        is_final=True,
    )
def train_and_evaluate(
    model,
    train_dataloader,
    val_dataloader,
    lr_init,
    optimizer_type,
    lr_decay,
    momentum,
    eps,
    wd,
    loss_fn,
    num_epochs,
    num_steps_train,
    num_steps_eval,
    pId_boxes_dict,
    rescale_factor,
    shape,
    save_path=None,
    restore_file=None,
):
    """Run the train/eval loop for `num_epochs`, tracking the best models.

    Rebuilds the optimizer each epoch (halving the learning rate per epoch
    for every optimizer except adagrad), trains with `train_model`, evaluates
    with `evaluate_model`, logs to TensorBoard, and optionally saves
    best-by-loss and best-by-precision checkpoints under `save_path`.

    Returns:
        (histories, best_models): dicts of loss/precision curves and the
        best-by-loss / best-by-precision model references.
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        checkpoint = torch.load(restore_file)
        model.load_state_dict(checkpoint["state_dict"])
    writer = SummaryWriter(f"{os.environ.get('TRAINML_OUTPUT_PATH')}/logs")
    best_val_loss = 1e15
    best_val_prec = 0.0
    best_loss_model = None
    best_prec_model = None
    loss_t_history = []
    loss_v_history = []
    loss_avg_t_history = []
    prec_t_history = []
    prec_v_history = []
    for epoch in range(num_epochs):
        start = time.time()
        # define the optimizer (recreated each epoch so the decayed lr applies)
        if optimizer_type == "adagrad":
            lr = lr_init  # adagrad uses its own built-in lr_decay instead
            optimizer = torch.optim.Adagrad(
                model.parameters(),
                lr=lr,
                lr_decay=lr_decay,
                weight_decay=wd,
            )
        else:
            lr = lr_init * 0.5**float(
                epoch)  # reduce the learning rate at each epoch
            if optimizer_type == "adam":
                optimizer = torch.optim.Adam(model.parameters(),
                                             lr=lr,
                                             eps=eps,
                                             weight_decay=wd)
            elif optimizer_type == "adamw":
                optimizer = torch.optim.AdamW(model.parameters(),
                                              lr=lr,
                                              eps=eps,
                                              weight_decay=wd)
            elif optimizer_type == "adamax":
                optimizer = torch.optim.Adamax(model.parameters(),
                                               lr=lr,
                                               eps=eps,
                                               weight_decay=wd)
            elif optimizer_type == "sgd":
                optimizer = torch.optim.SGD(
                    model.parameters(),
                    lr=lr,
                    momentum=momentum,
                    weight_decay=wd,
                )
            else:
                raise ValueError(
                    'Invalid optimizer_type, allowed values are "adam", "adamw", "adamax", "sgd", "adagrad"'
                )
        # Run one epoch
        print("Epoch {}/{}. Learning rate = {:05.3f}.".format(
            epoch + 1, num_epochs, lr))
        # train model for a whole epoc (one full pass over the training set)
        loss_avg_t_hist_ep, loss_t_hist_ep, prec_t_hist_ep = train_model(
            model,
            train_dataloader,
            optimizer,
            loss_fn,
            num_steps_train,
            pId_boxes_dict,
            rescale_factor,
            shape,
            writer,
            epoch=epoch,
        )
        loss_avg_t_history += loss_avg_t_hist_ep
        loss_t_history += loss_t_hist_ep
        prec_t_history += prec_t_hist_ep
        # Evaluate for one epoch on validation set
        val_metrics = evaluate_model(
            model,
            val_dataloader,
            loss_fn,
            num_steps_eval,
            pId_boxes_dict,
            rescale_factor,
            shape,
            writer,
            epoch=epoch,
        )
        val_loss = val_metrics["loss"]
        val_prec = val_metrics["precision"]
        # Repeat the scalar validation metrics so the validation curves line
        # up point-for-point with the per-step training curves.
        loss_v_history += len(loss_t_hist_ep) * [val_loss]
        prec_v_history += len(prec_t_hist_ep) * [val_prec]
        is_best_loss = val_loss <= best_val_loss
        is_best_prec = val_prec >= best_val_prec
        if is_best_loss:
            print("- Found new best loss: {:.4f}".format(val_loss))
            best_val_loss = val_loss
            # NOTE(review): stores a reference to the live model, not a copy —
            # after later epochs both "best" entries alias the final weights.
            # Confirm whether a deepcopy/state_dict snapshot was intended.
            best_loss_model = model
        if is_best_prec:
            print("- Found new best precision: {:.4f}".format(val_prec))
            best_val_prec = val_prec
            best_prec_model = model
        # Save best weights based on best_val_loss and best_val_prec
        if save_path:
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "state_dict": model.state_dict(),
                    "optim_dict": optimizer.state_dict(),
                },
                save_path,
                is_best=is_best_loss,
                metric="loss",
            )
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "state_dict": model.state_dict(),
                    "optim_dict": optimizer.state_dict(),
                },
                save_path,
                is_best=is_best_prec,
                metric="prec",
            )
        delta_time = time.time() - start
        print("Epoch run in {:.2f} minutes".format(delta_time / 60.0))
    histories = {
        "loss avg train": loss_avg_t_history,
        "loss train": loss_t_history,
        "precision train": prec_t_history,
        "loss validation": loss_v_history,
        "precision validation": prec_v_history,
    }
    best_models = {
        "best loss model": best_loss_model,
        "best precision model": best_prec_model,
    }
    return histories, best_models
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    initial_epoch: int,
    num_epoch: int,
    max_acc: float,
    hide_map_prob: float,
    num_load_files_training: int,
    fp16: bool = True,
    amp_opt_level=None,
    save_checkpoints: bool = True,
    save_every: int = 100,
    save_best: bool = True,
):
    """
    Train a model
    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been
      restored from checkpoint)
    - num_epoch: Number of epochs to do
    - max_acc: Accuracy in the development set (0 unless the model has been restored from checkpoint)
    - hide_map_prob: Probability for removing the minimap (black square)
      from the image (0<=hide_map_prob<=1)
    - num_load_files_training: Number of .npz files to load and shuffle together per iteration
    - fp16: Use FP16 for training
    - amp_opt_level: If FP16 training Nvidia apex opt level
    - save_checkpoints: save a checkpoint each epoch (Each checkpoint will rewrite the previous one)
    - save_every: Save a checkpoint every `save_every` iterations
    - save_best: save the model that achieves the higher accuracy in the development set
    Output:
    - float: Accuracy in the development test of the best model
    """
    if fp16:
        # apex is imported lazily so CPU/FP32 runs do not require it.
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss()
    print("Loading dev set")
    X_dev, y_dev = load_dataset(dev_dir, fp=16 if fp16 else 32)
    X_dev = torch.from_numpy(X_dev)
    print("Loading test set")
    X_test, y_test = load_dataset(test_dir, fp=16 if fp16 else 32)
    X_test = torch.from_numpy(X_test)
    acc_dev: float = 0.0
    total_training_exampels: int = 0
    printTrace("Training...")
    for epoch in range(num_epoch):
        iteration_no = 0
        num_used_files: int = 0
        files: List[str] = glob.glob(os.path.join(train_dir, "*.npz"))
        random.shuffle(files)
        # Get files in batches, all files will be loaded and data will be shuffled
        for paths in batch(files, num_load_files_training):
            iteration_no += 1
            num_used_files += num_load_files_training
            model.train()
            start_time: float = time.time()
            X, y = load_and_shuffle_datasets(paths=paths,
                                             fp=16 if fp16 else 32,
                                             hide_map_prob=hide_map_prob)
            total_training_exampels += len(y)
            running_loss = 0.0
            num_batchs = 0
            # Mini-batch SGD over the freshly loaded/shuffled chunk.
            # (Note: 'X_bacth'/'num_batchs' typos preserved — identifiers are code.)
            for X_bacth, y_batch in nn_batchs(X, y, batch_size):
                X_bacth, y_batch = (
                    torch.from_numpy(X_bacth).to(device),
                    torch.from_numpy(y_batch).long().to(device),
                )
                optimizer.zero_grad()
                outputs = model.forward(X_bacth)
                loss = criterion(outputs, y_batch)
                if fp16:
                    # apex requires scaling the loss for mixed precision.
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                # Clip gradients to 1.0 (apex master params when using fp16).
                if fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), 1.0)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                running_loss += loss.item()
                num_batchs += 1
            start_time_eval: float = time.time()
            # Print Statistics
            if len(X) > 0 and len(y) > 0:
                acc_train = evaluate(
                    model=model,
                    X=torch.from_numpy(X),
                    golds=y,
                    device=device,
                    batch_size=batch_size,
                )
            else:
                acc_train = -1.0  # sentinel: no data in this chunk
            acc_dev = evaluate(
                model=model,
                X=X_dev,
                golds=y_dev,
                device=device,
                batch_size=batch_size,
            )
            acc_test = evaluate(
                model=model,
                X=X_test,
                golds=y_test,
                device=device,
                batch_size=batch_size,
            )
            printTrace(
                f"EPOCH: {initial_epoch+epoch}. Iteration {iteration_no}. "
                f"{num_used_files} of {len(files)} files. "
                f"Total examples used for training {total_training_exampels}. "
                f"Iteration time: {time.time() - start_time} secs. Eval time: {time.time() - start_time_eval} secs."
            )
            printTrace(
                f"Loss: {-1 if num_batchs == 0 else running_loss / num_batchs}. Acc training set: {acc_train}. "
                f"Acc dev set: {acc_dev}. Acc test set: {acc_test}")
            # Persist the best-on-dev model whenever accuracy improves.
            if acc_dev > max_acc and save_best:
                max_acc = acc_dev
                printTrace(
                    f"New max acc in dev set {max_acc}. Saving model...")
                save_model(
                    model=model,
                    save_dir=output_dir,
                    fp16=fp16,
                    amp_opt_level=amp_opt_level,
                )
            # Periodic checkpoint (overwrites the previous one).
            if save_checkpoints and iteration_no % save_every == 0:
                printTrace("Saving checkpoint...")
                save_checkpoint(
                    path=os.path.join(output_dir, "checkpoint.pt"),
                    model=model,
                    optimizer_name=optimizer_name,
                    optimizer=optimizer,
                    acc_dev=acc_dev,
                    epoch=initial_epoch + epoch,
                    fp16=fp16,
                    opt_level=amp_opt_level,
                )
    return max_acc
# NOTE(review): truncated fragment — the `if` statements selecting between the
# branches below (GPU requested / CUDA available) precede this chunk and are
# not visible here; the first lines are orphaned `else` branches kept verbatim.
device = "cuda:0"
else :  # So if cuda is not available don't do some unexpected things, just raise an error.
    raise ValueError("We wanted to execute this training on GPU, but cuda is not available!!\nPlease remove the -g option or make sure cuda is available.")
else :
    device = 'cpu'

print("The training is done on {}".format(device))

# Build the checkpoint path: [<checkpoint_dir>]<prefix><architecture>.pth
if args.checkpoint_dir :
    ckp_filepath = args.checkpoint_dir + ckp_fileprefix + args.architecture + ".pth"
else :
    ckp_filepath = ckp_fileprefix + args.architecture + ".pth"

# Resume from an existing checkpoint when present, otherwise start fresh.
if os.path.isfile(ckp_filepath) :
    print("Checkpoint {} recognized, continue training this model!".format(ckp_filepath))
    model = mo.load_checkpoint(ckp_filepath, device)
else :
    print("Checkpoint {} not recognized, starting from scratch!".format(ckp_filepath))
    model = mo.init_model(args.directory, args.architecture, args.learning_rate, args.hidden_units)

# Create an object where we can iterate over the data
dataloaders, img_datasets, _ = ut.get_data_loader(args.directory)
# Is usefull in the do_training function
dataset_sizes = {x: len(img_datasets[x]) for x in ['train', 'valid', 'test']}

# Now we are ready to do some training
model = mo.do_training(model, dataloaders, dataset_sizes, device, epochs = args.epochs)

# Training is done, now save the network again
mo.save_checkpoint(model, args.architecture, img_datasets['train'], ckp_filepath)
def main():
    """End-to-end training entry point.

    Parses CLI args, loads the train/valid/test image folders, builds and
    trains the classifier, evaluates it on the test set, saves a checkpoint,
    and prints the total runtime in h:m:s.
    """
    # Measure total program runtime by collecting start time
    start_time = time()
    # Create & retrieve Command Line Arugments
    in_arg = get_input_args()
    # Set device to cuda if gpu flag is set.
    # (idiom fix: the flag is already a boolean — no need for `== True`)
    device = 'cuda' if in_arg.gpu else 'cpu'
    # Load the data
    train_dir = in_arg.data_dir + '/train'
    valid_dir = in_arg.data_dir + '/valid'
    test_dir = in_arg.data_dir + '/test'
    # Load the datasets with ImageFolder (train gets augmenting transforms,
    # valid/test share the deterministic test transforms).
    train_datasets = datasets.ImageFolder(train_dir,
                                          transform=train_transforms)
    valid_datasets = datasets.ImageFolder(valid_dir, transform=test_transforms)
    test_datasets = datasets.ImageFolder(test_dir, transform=test_transforms)
    # Using the image datasets and the transforms, define the dataloaders
    trainloader = torch.utils.data.DataLoader(train_datasets,
                                              batch_size=32,
                                              shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_datasets, batch_size=32)
    testloader = torch.utils.data.DataLoader(test_datasets, batch_size=32)
    # Get the number of classes as the size of the output layer
    output_size = len(train_datasets.classes)
    # Build the model
    model = build_model(in_arg.arch, output_size, in_arg.hidden_units)
    # Insert mapping from class to index and index to class
    model.class_to_idx = train_datasets.class_to_idx
    model.idx_to_class = {i: c for c, i in model.class_to_idx.items()}
    # Move model to cuda before constructing the optimizer
    model = model.to(device)
    # Define criterion
    criterion = nn.NLLLoss()
    # Define optimizer.
    # Only train the classifier parameters, feature parameters are frozen
    # (p.requires_grad == False).
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=in_arg.learning_rate)
    # Train the model
    train_model(model, trainloader, validloader, in_arg.epochs, criterion,
                optimizer, device)
    # Test the model
    print('Start testing')
    _, test_accuracy = validate_model(model, testloader, criterion, device)
    print('Finished testing')
    print(
        "Accuracy of the model on the test set: {:.3f}".format(test_accuracy))
    # Save checkpoint
    save_checkpoint(in_arg.save_path, model, in_arg.arch, output_size,
                    in_arg.epochs, optimizer)
    # Measure total program runtime by collecting end time
    end_time = time()
    # Computes overall runtime in seconds & prints it in h:m:s format.
    # divmod replaces the repeated //-and-% arithmetic; output is unchanged.
    tot_time = end_time - start_time
    hours, rem = divmod(int(tot_time), 3600)
    minutes, seconds = divmod(rem, 60)
    print("\n** Total Elapsed Runtime:",
          str(hours) + ":" + str(minutes) + ":" + str(seconds))
from image import load_images
from workspace_utils import active_session

# Command-line interface for the training script.
parser = argparse.ArgumentParser(description='Training image classifier')
parser.add_argument(
    'data_directory',
    help='image data directory path with train/valid/test subfolders')
parser.add_argument('--save_dir', help='Model checkpoint saving', default='.')
parser.add_argument('--arch',
                    help='Pretrained model from torchvision',
                    default='vgg16')
parser.add_argument('--learning_rate',
                    type=float,
                    help='Optimizer learning rate',
                    default=0.003)
parser.add_argument('--hidden_units',
                    type=int,
                    help='Number of hidden units in customized classifier',
                    default=256)
parser.add_argument('--epochs',
                    type=int,
                    help='Number of training epochs',
                    default=30)
parser.add_argument('--gpu',
                    action='store_true',
                    default=False,
                    help='Flag to set using GPU')
args = parser.parse_args()
print('Hyperparameters:', args)

dataloaders, class_to_idx = load_images(args.data_directory)
# Use CUDA only when both requested via --gpu and actually available.
device = torch.device(
    "cuda" if args.gpu and torch.cuda.is_available() else "cpu")

# active_session keeps the workspace alive for the duration of training.
with active_session():
    print("Start loading model...")
    model = get_torchvision_model(args.arch, args.hidden_units)
    print(model.classifier)
    print("Start training model...")
    train_model(args.learning_rate, model, args.epochs, device, dataloaders)
    save_path = save_checkpoint(model, args.save_dir, args.hidden_units,
                                args.arch, class_to_idx)
    # BUG FIX: user-facing typo — "is save at" -> "is saved at".
    print('Training complete. Model checkpoint is saved at: %s' % save_path)
def main_worker(gpu, ngpus_per_node, args):
    """Per-process training worker supporting DDP, single-GPU, and CPU runs.

    Sets up logging and (optionally) the distributed process group, builds
    the model and optimizer, restores weights from an initial or resume
    checkpoint, prepares the train/validation datasets and loaders, then
    runs the epoch loop with ReduceLROnPlateau scheduling and checkpointing.
    """
    args.gpu = gpu
    setup_default_logging()
    _logger = logging.getLogger('train')
    if args.gpu is not None:
        _logger.info("Use GPU: {} for training".format(args.gpu))
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        # Global rank = node rank * GPUs-per-node + local GPU index.
        args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(
            backend=args.dist_backend,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=args.rank,
        )
    # Only one process per node logs/saves verbosely.
    args.verbose = not args.distributed or (args.distributed and
                                            args.rank % ngpus_per_node == 0)
    if args.verbose:
        _logger.info("create model {}".format(args.arch))
    model = create_model(args.arch, args.num_classes, args.pretrained)
    if args.distributed:
        # single process per GPU (one device per worker)
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # Split the global batch size / workers across this node's processes.
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        # single process driving multiple GPUs
        else:
            model.cuda()
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        if args.verbose:
            _logger.warning("no gpu for training, using cpu")
    optimizer = optim.Adam(model.parameters(), args.lr)
    start_epoch = args.start_epoch
    # Initialize weights only (no optimizer/epoch state) from a checkpoint.
    if args.initial_checkpoint is not None:
        if os.path.isfile(args.initial_checkpoint):
            if args.verbose:
                _logger.info("initializing model from '{}'".format(
                    args.initial_checkpoint))
            if args.gpu is None:
                checkpoint = torch.load(args.initial_checkpoint)
            else:
                # Remap tensors onto this worker's GPU.
                checkpoint = torch.load(args.initial_checkpoint,
                                        map_location='cuda:{}'.format(
                                            args.gpu))
            state_dict = checkpoint['state_dict']
            model.load_state_dict(state_dict)
            if args.verbose:
                _logger.info("initialized model from '{}'".format(
                    args.initial_checkpoint))
    # Full resume: restores model, optimizer, and the starting epoch.
    if args.resume is not None:
        if os.path.isfile(args.resume):
            if args.verbose:
                _logger.info("loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                checkpoint = torch.load(args.resume,
                                        map_location='cuda:{}'.format(
                                            args.gpu))
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if args.verbose:
                _logger.info("loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               mode='min',
                                               patience=args.patience_epochs,
                                               verbose=args.verbose)
    # Dataset selection: explicit validation CSV, multi-label random split,
    # or a stratified split of a single dataset.
    if args.vdata is not None and args.val_csv is not None:
        train_df = pd.read_csv(args.csv)
        train_set = create_dataset(args.data, train_df, args.mean, args.std,
                                   args.multi, evaluate=False)
        val_df = pd.read_csv(args.val_csv)
        val_set = create_dataset(args.vdata, val_df, args.mean, args.std,
                                 args.multi, evaluate=False)
    else:
        if args.multi:
            assert args.csv is not None, "Please specify annotation file"
            df = pd.read_csv(args.csv)
            # Random row-level split for the multi-label case.
            val_df = df.sample(frac=args.test_split, random_state=args.seed)
            train_df = df.drop(val_df.index)
            train_set = create_dataset(args.data, train_df, args.mean,
                                       args.std, multi=True, train=True,
                                       evaluate=False)
            val_set = create_dataset(args.data, val_df, args.mean, args.std,
                                     multi=True, train=True, evaluate=False)
        else:
            df = pd.read_csv(args.csv) if args.csv is not None else None
            dataset = create_dataset(args.data, df, args.mean, args.std,
                                     multi=False, train=True, evaluate=False)
            # Deep-copy so train/val can hold disjoint paths/targets while
            # sharing the dataset configuration.
            train_set = copy.deepcopy(dataset)
            val_set = copy.deepcopy(dataset)
            kf = StratifiedShuffleSplit(n_splits=1,
                                        test_size=args.test_split,
                                        random_state=args.seed)
            train_idx, val_idx = next(kf.split(dataset.paths,
                                               dataset.targets))
            train_set.paths = [dataset.paths[i] for i in train_idx]
            train_set.targets = [dataset.targets[i] for i in train_idx]
            val_set.paths = [dataset.paths[i] for i in val_idx]
            val_set.targets = [dataset.targets[i] for i in val_idx]
            # val_set.transforms = transforms.Compose([transforms.ToTensor()])
    if args.verbose:
        _logger.info("Training set:\n{}".format(train_set))
        _logger.info("Validation set:\n{}".format(val_set))
    if args.distributed:
        train_sampler = DistributedSampler(
            dataset=train_set,
            shuffle=True,
        )
        val_sampler = DistributedSampler(
            dataset=val_set,
            shuffle=False,
        )
    else:
        train_sampler = None
        val_sampler = None
    train_loader = DataLoader(
        dataset=train_set,
        batch_size=args.batch_size,
        shuffle=not args.distributed,  # the sampler shuffles when distributed
        sampler=train_sampler,
        num_workers=args.workers,
        pin_memory=True,
        drop_last=True,
    )
    val_loader = DataLoader(
        dataset=val_set,
        batch_size=args.batch_size,
        shuffle=False,
        sampler=val_sampler,
        num_workers=args.workers,
        pin_memory=True,
    )
    # BCE for multi-label targets, cross-entropy for single-label.
    if args.multi:
        train_criterion = nn.BCELoss().cuda(args.gpu)
        val_criterion = nn.BCELoss().cuda(args.gpu)
    else:
        train_criterion = nn.CrossEntropyLoss().cuda(args.gpu)
        val_criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    best_metric = None
    for epoch in range(start_epoch, args.epochs):
        train(args, epoch, model, train_loader, optimizer, train_criterion,
              logger=_logger)
        val_loss, val_acc, val_recal = validate(args, epoch, model,
                                                val_loader, val_criterion,
                                                logger=_logger)
        # Plateau scheduler steps on the validation loss.
        scheduler.step(val_loss)
        # Track the best (lowest) validation loss seen so far.
        if best_metric is not None and val_loss < best_metric:
            is_best = True
            best_metric = val_loss
        elif best_metric is None:
            is_best = True
            best_metric = val_loss
        else:
            is_best = False
        # Only the verbose (rank-0 per node) process writes checkpoints.
        if args.verbose:
            checkpoint = {
                'epoch': epoch,
                'state_dict': model.state_dict()
                if not args.distributed else model.module.state_dict(),
                'optimizer': optimizer.state_dict()
            }
            save_checkpoint(checkpoint, args.output, epoch, val_loss, val_acc,
                            is_best)
        # NOTE(review): dist.barrier() also runs for non-distributed runs —
        # presumably this requires an initialized process group; confirm.
        dist.barrier()
# NOTE(review): truncated fragment of a larger training script — `device`,
# `model`, `optimizer`, `loss`, `train_dataloader`, `epoch`,
# `train_history_loss`, `train_history_acc`, and `path_model_save` are all
# defined before this chunk; presumably everything below sits inside an
# epoch loop (it references `epoch`). Confirm against the full file.
running_loss = 0.
running_acc = 0.
# One full pass over the training loader.
for inputs, labels in tqdm(train_dataloader):
    inputs = inputs.to(device)
    labels = labels.to(device)
    optimizer.zero_grad()
    with torch.set_grad_enabled(True):  # gradients explicitly on for training
        predicts = model(inputs)
        loss_value = loss(predicts, labels)
        predicts_class = predicts.argmax(dim=1)
        loss_value.backward()
        optimizer.step()
        running_loss += loss_value.item()
        running_acc += (predicts_class == labels.data).float().mean()
# Per-epoch averages of the accumulated batch statistics.
epoch_loss = running_loss / len(train_dataloader)
epoch_acc = running_acc / len(train_dataloader)
train_history_loss.append(epoch_loss)
train_history_acc.append(epoch_acc)
print('Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc), flush=True)
# Checkpoint on every 10th epoch.
if epoch % 10 == 0:
    model.save_checkpoint(epoch, model, optimizer, epoch_loss,
                          path_model_save)
print('end train')
def self_training(model,
                  labeled_dataset,
                  unlabeled_dataset,
                  optimizer,
                  scheduler=None,
                  batch_size=4,
                  train_ratio=0.7,
                  score_threshold=0.7,
                  unlabeled_loss_weight=0.1,
                  relabel_step=None,
                  device='cpu',
                  max_epochs=100,
                  print_freq=10,
                  save_path=None,
                  checkpoint=None):
    """Self-training loop alternating supervised and pseudo-label training.

    Each epoch reloads the pickled dataset splits from `save_path/dataset`,
    regenerates pseudo-labels (detections scoring above `score_threshold`)
    with the current model, trains one epoch on the labeled loader and one on
    the pseudo-labeled loader, runs COCO evaluation on both validation
    loaders, and saves a checkpoint.

    NOTE(review): `labeled_dataset`, `unlabeled_dataset`, `train_ratio`, and
    `relabel_step` are currently unused (splits come from the pickles);
    `all_training_loss` / `all_evaluation_loss` must exist at module level;
    the evaluation loss is hard-coded to 0 — the real computation is
    commented out below.
    """
    model.to(device)
    metric_logger = utils.MetricLogger(delimiter="  ")
    last_loss = 1e9
    cur_epoch = 0
    # train_labeled_dataset, val_labeled_dataset = split_dataset(labeled_dataset, train_ratio)
    # train_unlabeled_dataset, val_unlabeled_dataset = split_dataset(unlabeled_dataset, train_ratio)
    dataset_path = os.path.join(save_path, 'dataset')
    # Optionally resume model/optimizer/scheduler/epoch from a checkpoint.
    if checkpoint is not None:
        print("loading checkpoint:" + checkpoint)
        model, optimizer, scheduler, cur_epoch = load_checkpoint(
            model, optimizer, scheduler, device, checkpoint)
    for epoch in range(cur_epoch, max_epochs):
        print("epoch {} / {}".format(epoch + 1, max_epochs))
        # Reload the four persisted dataset splits each epoch.
        with open(os.path.join(dataset_path, 'train_labeled_dataset.pickle'),
                  'rb') as handle:
            train_labeled_dataset = pickle.load(handle)
        with open(os.path.join(dataset_path, 'val_labeled_dataset.pickle'),
                  'rb') as handle:
            val_labeled_dataset = pickle.load(handle)
        with open(
                os.path.join(dataset_path,
                             'train_unlabeled_dataset.pickle'),
                'rb') as handle:
            train_unlabeled_dataset = pickle.load(handle)
        with open(
                os.path.join(dataset_path, 'val_unlabeled_dataset.pickle'),
                'rb') as handle:
            val_unlabeled_dataset = pickle.load(handle)
        train_unlabeled_dataset = convert_subset(train_unlabeled_dataset)
        val_unlabeled_dataset = convert_subset(val_unlabeled_dataset)
        labeled_train_loader = DataLoader(train_labeled_dataset,
                                          collate_fn=collate_fn,
                                          batch_size=batch_size,
                                          shuffle=True)
        labeled_vld_loader = DataLoader(val_labeled_dataset,
                                        collate_fn=collate_fn,
                                        batch_size=batch_size,
                                        shuffle=False)
        # Pseudo-label the unlabeled data with the current model.
        pseudo_train = FLIRPseudoDataset(model,
                                         train_unlabeled_dataset,
                                         batch_size=batch_size,
                                         device=device,
                                         score_threshold=score_threshold)
        pseudo_val = FLIRPseudoDataset(model,
                                       val_unlabeled_dataset,
                                       batch_size=batch_size,
                                       device=device,
                                       score_threshold=score_threshold)
        unlabeled_train_loader = DataLoader(pseudo_train,
                                            collate_fn=collate_fn,
                                            batch_size=batch_size,
                                            shuffle=True)
        unlabeled_vld_loader = DataLoader(pseudo_val,
                                          collate_fn=collate_fn,
                                          batch_size=batch_size,
                                          shuffle=False)
        # One epoch on labeled data (weight 1), one on pseudo-labeled data
        # (weight `unlabeled_loss_weight`).
        train_label_loss = train_one_epoch_self_training(
            model, optimizer, labeled_train_loader, 1, device, epoch,
            print_freq)
        train_loss = train_one_epoch_self_training(model, optimizer,
                                                   unlabeled_train_loader,
                                                   unlabeled_loss_weight,
                                                   device, epoch, print_freq)
        train_loss = train_label_loss + unlabeled_loss_weight * train_loss
        all_training_loss.append(train_loss)
        coco_evaluate(model, labeled_vld_loader, device)
        # labeled_loss = evaluate(model, vld_loader, device, epoch, print_freq)
        coco_evaluate(model, unlabeled_vld_loader, device)
        # unlabeled_loss = evaluate(model, vld_loader, device, epoch, print_freq)
        # loss = labeled_loss + unlabeled_loss_weight * unlabeled_loss
        loss = 0  # placeholder while the evaluation losses above are disabled
        all_evaluation_loss.append(loss)
        if save_path is not None:
            save_checkpoint(model, optimizer, scheduler, epoch + 1, device,
                            save_path)
        last_loss = loss
        print("epoch {}, train loss {}, validation loss {}".format(
            epoch + 1, train_loss, loss))
        if scheduler is not None:
            scheduler.step()
print('-' * 89) generate_output(args, epoch, model, gen_dataset, startPoint=1500) if epoch % args.save_interval == 0: # Save the model if the validation loss is the best we've seen so far. is_best = val_loss > best_val_loss best_val_loss = max(val_loss, best_val_loss) model_dictionary = { 'epoch': epoch, 'best_loss': best_val_loss, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'args': args } model.save_checkpoint(model_dictionary, is_best) except KeyboardInterrupt: print('-' * 89) print('Exiting from training early') # Calculate mean and covariance for each channel's prediction errors, and save them with the trained model print('=> calculating mean and covariance') means, covs = list(), list() train_dataset = TimeseriesData.batchify(args, TimeseriesData.trainData, bsz=1) for channel_idx in range(model.enc_input_size): mean, cov = fit_norm_distribution_param( args, model, train_dataset[:TimeseriesData.length], channel_idx) means.append(mean), covs.append(cov) model_dictionary = { 'epoch': max(epoch, start_epoch),