import datetime
import os
import time
from pathlib import Path

import boto3
import torch
import torchvision
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision import transforms

# Project-specific helpers (net, create_metrics_dict, tsv_line, flatten_labels,
# flatten_outputs, report_classification, InformationLogger, save_checkpoint,
# load_from_checkpoint, set_hyperparameters, download_s3_files, create_dataloader,
# get_num_samples, CreateDataset, aug, loader, etc.) are assumed to be imported
# from the project's own modules.


def train(train_loader, model, criterion, optimizer, scheduler, num_classes,
          batch_size, task, ep_idx, progress_log, num_devices):
    """
    Train the model and return the metrics of the training epoch
    :param train_loader: training data loader
    :param model: model to train
    :param criterion: loss criterion
    :param optimizer: optimizer to use
    :param scheduler: learning rate scheduler
    :param num_classes: number of classes
    :param batch_size: number of samples to process simultaneously
    :param task: segmentation or classification
    :param ep_idx: epoch index (for hypertrainer log)
    :param progress_log: progress log file (for hypertrainer log)
    :param num_devices: (int) number of GPU devices to use
    :return: (dict) updated training metrics
    """
    model.train()
    train_metrics = create_metrics_dict(num_classes)

    for index, data in enumerate(train_loader):
        progress_log.open('a', buffering=1).write(
            tsv_line(ep_idx, 'trn', index, len(train_loader), time.time()))

        if task == 'classification':
            inputs, labels = data
            if torch.cuda.is_available():
                inputs = inputs.cuda()
                labels = labels.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs_flatten = outputs
        elif task == 'segmentation':
            if num_devices > 0:
                inputs = data['sat_img'].cuda()
                labels = flatten_labels(data['map_img']).cuda()
            else:
                inputs = data['sat_img']
                labels = flatten_labels(data['map_img'])
            # forward
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs_flatten = flatten_outputs(outputs, num_classes)
            del outputs
            del inputs

        loss = criterion(outputs_flatten, labels)
        train_metrics['loss'].update(loss.item(), batch_size)
        loss.backward()
        optimizer.step()
        scheduler.step()

    print('Training Loss: {:.4f}'.format(train_metrics['loss'].avg))
    return train_metrics
def train(train_loader, model, criterion, optimizer, scheduler, num_classes,
          batch_size, classifier, ep_idx, progress_log):
    """Train the model and return the metrics of the training phase.

    Args:
        train_loader: training data loader
        model: model to train
        criterion: loss criterion
        optimizer: optimizer to use
        scheduler: learning rate scheduler
        num_classes: number of classes
        batch_size: number of samples to process simultaneously
        classifier: True if doing a classification task, False if doing
            semantic segmentation
        ep_idx: epoch index (for hypertrainer log)
        progress_log: progress log file (for hypertrainer log)
    """
    model.train()
    scheduler.step()
    train_metrics = create_metrics_dict(num_classes)

    for index, data in enumerate(train_loader):
        progress_log.open('a', buffering=1).write(
            tsv_line(ep_idx, 'trn', index, len(train_loader), time.time()))

        if classifier:
            inputs, labels = data
            if torch.cuda.is_available():
                inputs = inputs.cuda()
                labels = labels.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs_flatten = outputs
        else:
            if torch.cuda.is_available():
                inputs = data['sat_img'].cuda()
                labels = flatten_labels(data['map_img']).cuda()
            else:
                inputs = data['sat_img']
                labels = flatten_labels(data['map_img'])
            # forward
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs_flatten = flatten_outputs(outputs, num_classes)
            del outputs
            del inputs

        loss = criterion(outputs_flatten, labels)
        train_metrics['loss'].update(loss.item(), batch_size)
        loss.backward()
        optimizer.step()

    print('Training Loss: {:.4f}'.format(train_metrics['loss'].avg))
    return train_metrics
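# --- Illustration: flatten helpers --------------------------------------------
# Both train() variants above feed CrossEntropyLoss with flattened tensors:
# logits of shape (N, C) and targets of shape (N,). A minimal sketch of what
# `flatten_outputs` and `flatten_labels` are assumed to do; the project's real
# helpers live in its utility module, so treat these bodies as illustrative
# rather than the actual implementation.
def flatten_outputs(predictions, number_of_classes):
    """Flatten (batch, class, H, W) logits to (batch*H*W, class)."""
    logits_permuted = predictions.permute(0, 2, 3, 1)
    return logits_permuted.contiguous().view(-1, number_of_classes)


def flatten_labels(annotations):
    """Flatten (batch, H, W) label maps to (batch*H*W,)."""
    return annotations.view(-1)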
def validation(valid_loader, model, criterion, num_classes, batch_size,
               classifier, ep_idx, progress_log, batch_metrics=None):
    """Validate the model and return the metrics of the validation phase.

    Args:
        valid_loader: validation data loader
        model: model to validate
        criterion: loss criterion
        num_classes: number of classes
        batch_size: number of samples to process simultaneously
        classifier: True if doing a classification task, False if doing
            semantic segmentation
        ep_idx: epoch index (for hypertrainer log)
        progress_log: progress log file (for hypertrainer log)
        batch_metrics: (int) metrics computed every (int) batches; if left
            blank, metrics are not computed
    """
    valid_metrics = create_metrics_dict(num_classes)
    model.eval()

    for index, data in enumerate(valid_loader):
        progress_log.open('a', buffering=1).write(
            tsv_line(ep_idx, 'val', index, len(valid_loader), time.time()))

        with torch.no_grad():
            if classifier:
                inputs, labels = data
                if torch.cuda.is_available():
                    inputs = inputs.cuda()
                    labels = labels.cuda()
                outputs = model(inputs)
                outputs_flatten = outputs
            else:
                if torch.cuda.is_available():
                    inputs = data['sat_img'].cuda()
                    labels = flatten_labels(data['map_img']).cuda()
                else:
                    inputs = data['sat_img']
                    labels = flatten_labels(data['map_img'])
                outputs = model(inputs)
                outputs_flatten = flatten_outputs(outputs, num_classes)

            loss = criterion(outputs_flatten, labels)
            valid_metrics['loss'].update(loss.item(), batch_size)

            # Compute metrics every `batch_metrics` batches. Time consuming.
            if batch_metrics is not None:
                if index % batch_metrics == 0:
                    _, segmentation = torch.max(outputs_flatten, dim=1)
                    valid_metrics = report_classification(
                        segmentation, labels, batch_size, valid_metrics)

    print('Validation Loss: {:.4f}'.format(valid_metrics['loss'].avg))
    if batch_metrics is not None:
        print('Validation precision: {:.4f}'.format(
            valid_metrics['precision'].avg))
        print('Validation recall: {:.4f}'.format(valid_metrics['recall'].avg))
        print('Validation f1-score: {:.4f}'.format(valid_metrics['fscore'].avg))

    return valid_metrics
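# --- Illustration: metrics dict contract ---------------------------------------
# train() and validation() assume create_metrics_dict() returns a dict of meters
# exposing .update(value, n) and .avg. A minimal sketch of that contract,
# assuming a plain running-average meter; the project's own implementation may
# track additional keys or per-class values.
class AverageMeter:
    """Keeps a running average of a scalar metric."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, value, n=1):
        self.sum += value * n
        self.count += n
        self.avg = self.sum / self.count


def create_metrics_dict(num_classes):
    # num_classes is unused in this sketch; the real helper may add
    # per-class entries keyed on it.
    return {key: AverageMeter()
            for key in ('loss', 'precision', 'recall', 'fscore', 'iou')}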
def main(bucket_name, data_path, output_path, num_trn_samples, num_val_samples,
         pretrained, batch_size, num_epochs, learning_rate, weight_decay,
         step_size, gamma, num_classes, class_weights, batch_metrics, model,
         classifier, model_name):
    """Train and validate a model for semantic segmentation or classification.

    Args:
        bucket_name: bucket in which data is stored if using AWS S3
        data_path: full file path of the folder containing h5py files
        output_path: full file path in which the model will be saved
        num_trn_samples: number of training samples
        num_val_samples: number of validation samples
        pretrained: boolean indicating if the model is pretrained
        batch_size: number of samples to process simultaneously
        num_epochs: number of epochs
        learning_rate: learning rate
        weight_decay: weight decay
        step_size: step size
        gamma: multiplicative factor of learning rate decay
        num_classes: number of classes
        class_weights: weights to apply to each class. A value > 1.0 will
            apply more weight to the learning of the class
        batch_metrics: (int) metrics computed every (int) batches; if left
            blank, metrics are not computed
        model: CNN model (tensor)
        classifier: True if doing image classification, False if doing
            semantic segmentation
        model_name: name of the model used for training

    Returns:
        Files 'checkpoint.pth.tar' and 'last_epoch.pth.tar' containing the
        trained weights
    """
    if bucket_name:
        if output_path is None:
            bucket_output_path = None
        else:
            bucket_output_path = output_path
        output_path = 'output_path'
        try:
            os.mkdir(output_path)
        except FileExistsError:
            pass
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        if classifier:
            for i in ['trn', 'val']:
                get_s3_classification_images(i, bucket, bucket_name, data_path,
                                             output_path, num_classes)
            class_file = os.path.join(output_path, 'classes.csv')
            if bucket_output_path:
                bucket.upload_file(
                    class_file,
                    os.path.join(bucket_output_path, 'classes.csv'))
            else:
                bucket.upload_file(class_file, 'classes.csv')
            data_path = 'Images'
        else:
            if data_path:
                bucket.download_file(
                    os.path.join(data_path, 'samples/trn_samples.hdf5'),
                    'samples/trn_samples.hdf5')
                bucket.download_file(
                    os.path.join(data_path, 'samples/val_samples.hdf5'),
                    'samples/val_samples.hdf5')
            else:
                bucket.download_file('samples/trn_samples.hdf5',
                                     'samples/trn_samples.hdf5')
                bucket.download_file('samples/val_samples.hdf5',
                                     'samples/val_samples.hdf5')
            verify_sample_count(num_trn_samples, num_val_samples, data_path,
                                bucket_name)
    elif classifier:
        get_local_classes(num_classes, data_path, output_path)
    else:
        verify_sample_count(num_trn_samples, num_val_samples, data_path,
                            bucket_name)

    verify_weights(num_classes, class_weights)

    since = time.time()
    best_loss = 999

    progress_log = Path(output_path) / 'progress.log'
    if not progress_log.exists():
        # Add header
        progress_log.open('w', buffering=1).write(
            tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time'))

    trn_log = InformationLogger(output_path, 'trn')
    val_log = InformationLogger(output_path, 'val')

    if torch.cuda.is_available():
        model = model.cuda()
        if class_weights:
            criterion = nn.CrossEntropyLoss(
                weight=torch.tensor(class_weights)).cuda()
        else:
            criterion = nn.CrossEntropyLoss().cuda()
    else:
        if class_weights:
            criterion = nn.CrossEntropyLoss(weight=torch.tensor(class_weights))
        else:
            criterion = nn.CrossEntropyLoss()

    optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                           weight_decay=weight_decay)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer, step_size=step_size, gamma=gamma)  # learning rate decay

    if pretrained != '':
        model, optimizer = load_from_checkpoint(pretrained, model, optimizer)

    if classifier:
        trn_dataset = torchvision.datasets.ImageFolder(
            os.path.join(data_path, "trn"),
            transform=transforms.Compose([
                transforms.RandomRotation((0, 275)),
                transforms.RandomHorizontalFlip(),
                transforms.Resize(299),
                transforms.ToTensor()
            ]),
            loader=loader)
        val_dataset = torchvision.datasets.ImageFolder(
            os.path.join(data_path, "val"),
            transform=transforms.Compose(
                [transforms.Resize(299), transforms.ToTensor()]),
            loader=loader)
    else:
        if not bucket_name:
            trn_dataset = CreateDataset.SegmentationDataset(
                os.path.join(data_path, "samples"), num_trn_samples, "trn",
                transform=transforms.Compose([
                    aug.RandomRotationTarget(),
                    aug.HorizontalFlip(),
                    aug.ToTensorTarget()
                ]))
            val_dataset = CreateDataset.SegmentationDataset(
                os.path.join(data_path, "samples"), num_val_samples, "val",
                transform=transforms.Compose([aug.ToTensorTarget()]))
        else:
            trn_dataset = CreateDataset.SegmentationDataset(
                'samples', num_trn_samples, "trn",
                transform=transforms.Compose([
                    aug.RandomRotationTarget(),
                    aug.HorizontalFlip(),
                    aug.ToTensorTarget()
                ]))
            val_dataset = CreateDataset.SegmentationDataset(
                "samples", num_val_samples, "val",
                transform=transforms.Compose([aug.ToTensorTarget()]))

    # Shuffle must be set to True.
    trn_dataloader = DataLoader(trn_dataset, batch_size=batch_size,
                                num_workers=4, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size,
                                num_workers=4, shuffle=True)

    now = datetime.datetime.now().strftime("%Y-%m-%d_%I-%M ")

    for epoch in range(0, num_epochs):
        print()
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 20)

        trn_report = train(trn_dataloader, model, criterion, optimizer,
                           lr_scheduler, num_classes, batch_size, classifier,
                           epoch, progress_log)
        trn_log.add_values(trn_report, epoch,
                           ignore=['precision', 'recall', 'fscore', 'iou'])

        val_report = validation(val_dataloader, model, criterion, num_classes,
                                batch_size, classifier, epoch, progress_log,
                                batch_metrics)
        val_loss = val_report['loss'].avg
        if batch_metrics is not None:
            val_log.add_values(val_report, epoch)
        else:
            val_log.add_values(val_report, epoch,
                               ignore=['precision', 'recall', 'fscore', 'iou'])

        if val_loss < best_loss:
            print("save checkpoint")
            filename = os.path.join(output_path, 'checkpoint.pth.tar')
            best_loss = val_loss
            save_checkpoint(
                {
                    'epoch': epoch,
                    'arch': model_name,
                    'model': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict()
                }, filename)

            if bucket_name:
                if bucket_output_path:
                    bucket_filename = os.path.join(bucket_output_path,
                                                   'checkpoint.pth.tar')
                else:
                    bucket_filename = 'checkpoint.pth.tar'
                bucket.upload_file(filename, bucket_filename)

        if bucket_name:
            save_logs_to_bucket(bucket, bucket_output_path, output_path, now,
                                batch_metrics)

        cur_elapsed = time.time() - since
        print('Current elapsed time {:.0f}m {:.0f}s'.format(
            cur_elapsed // 60, cur_elapsed % 60))

    filename = os.path.join(output_path, 'last_epoch.pth.tar')
    save_checkpoint(
        {
            'epoch': epoch,
            'arch': model_name,
            'model': model.state_dict(),
            'best_loss': best_loss,
            'optimizer': optimizer.state_dict()
        }, filename)
    if bucket_name:
        if bucket_output_path:
            bucket_filename = os.path.join(bucket_output_path,
                                           'last_epoch.pth.tar')
            bucket.upload_file(
                "output.txt",
                os.path.join(bucket_output_path, f"Logs/{now}_output.txt"))
        else:
            bucket_filename = 'last_epoch.pth.tar'
            bucket.upload_file("output.txt", f"Logs/{now}_output.txt")
        bucket.upload_file(filename, bucket_filename)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
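# --- Illustration: progress log lines ------------------------------------------
# Both main() variants write the hypertrainer progress log through tsv_line().
# A one-line sketch, assuming it simply tab-joins its arguments and appends a
# newline, which matches how the header row and the per-batch rows are written
# above; the project's real helper may differ in detail.
def tsv_line(*args):
    return '\t'.join(map(str, args)) + '\n'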
def evaluation(eval_loader, model, criterion, num_classes, batch_size, task,
               ep_idx, progress_log, batch_metrics=None, dataset='val',
               num_devices=0):
    """
    Evaluate the model and return the updated metrics
    :param eval_loader: data loader
    :param model: model to evaluate
    :param criterion: loss criterion
    :param num_classes: number of classes
    :param batch_size: number of samples to process simultaneously
    :param task: segmentation or classification
    :param ep_idx: epoch index (for hypertrainer log)
    :param progress_log: progress log file (for hypertrainer log)
    :param batch_metrics: (int) metrics computed every (int) batches; if left
        blank, metrics are not computed
    :param dataset: (str) 'val' or 'tst'
    :param num_devices: (int) number of GPU devices to use
    :return: (dict) eval_metrics
    """
    eval_metrics = create_metrics_dict(num_classes)
    model.eval()

    for index, data in enumerate(eval_loader):
        progress_log.open('a', buffering=1).write(
            tsv_line(ep_idx, dataset, index, len(eval_loader), time.time()))

        with torch.no_grad():
            if task == 'classification':
                inputs, labels = data
                if torch.cuda.is_available():
                    inputs = inputs.cuda()
                    labels = labels.cuda()
                outputs = model(inputs)
                outputs_flatten = outputs
            elif task == 'segmentation':
                if num_devices > 0:
                    inputs = data['sat_img'].cuda()
                    labels = flatten_labels(data['map_img']).cuda()
                else:
                    inputs = data['sat_img']
                    labels = flatten_labels(data['map_img'])
                outputs = model(inputs)
                outputs_flatten = flatten_outputs(outputs, num_classes)

            loss = criterion(outputs_flatten, labels)
            eval_metrics['loss'].update(loss.item(), batch_size)

            if (dataset == 'val') and (batch_metrics is not None):
                # Compute metrics every `batch_metrics` batches. Time consuming.
                if index % batch_metrics == 0:
                    _, segmentation = torch.max(outputs_flatten, dim=1)
                    eval_metrics = report_classification(
                        segmentation, labels, batch_size, eval_metrics)
            elif dataset == 'tst':
                _, segmentation = torch.max(outputs_flatten, dim=1)
                eval_metrics = report_classification(segmentation, labels,
                                                     batch_size, eval_metrics)

    print(f"{dataset} Loss: {eval_metrics['loss'].avg}")
    if batch_metrics is not None:
        print(f"{dataset} precision: {eval_metrics['precision'].avg}")
        print(f"{dataset} recall: {eval_metrics['recall'].avg}")
        print(f"{dataset} fscore: {eval_metrics['fscore'].avg}")

    return eval_metrics
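# --- Illustration: batch metrics ------------------------------------------------
# evaluation() and validation() delegate precision/recall/f-score updates to
# report_classification(). A hedged sketch built on scikit-learn's
# precision_recall_fscore_support (the zero_division argument requires
# scikit-learn >= 0.22); the project's real helper may average or weight the
# scores differently, so treat this body as an assumption.
from sklearn.metrics import precision_recall_fscore_support


def report_classification(predictions, labels, batch_size, metrics_dict):
    """Update precision/recall/fscore meters from flattened predictions."""
    y_pred = predictions.cpu().numpy()
    y_true = labels.cpu().numpy()
    precision, recall, fscore, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0)
    metrics_dict['precision'].update(precision, batch_size)
    metrics_dict['recall'].update(recall, batch_size)
    metrics_dict['fscore'].update(fscore, batch_size)
    return metrics_dict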
def main(params):
    """
    Train and validate a model for semantic segmentation or classification.
    :param params: (dict) parameters found in the yaml config file
    """
    model, state_dict_path, model_name = net(params)
    bucket_name = params['global']['bucket_name']
    output_path = params['training']['output_path']
    data_path = params['global']['data_path']
    task = params['global']['task']
    num_classes = params['global']['num_classes']
    batch_size = params['training']['batch_size']

    if bucket_name:
        bucket, bucket_output_path, output_path, data_path = download_s3_files(
            bucket_name=bucket_name, data_path=data_path,
            output_path=output_path, num_classes=num_classes, task=task)
    elif not bucket_name and task == 'classification':
        get_local_classes(num_classes, data_path, output_path)

    since = time.time()
    best_loss = 999

    progress_log = Path(output_path) / 'progress.log'
    if not progress_log.exists():
        # Add header
        progress_log.open('w', buffering=1).write(
            tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time'))

    trn_log = InformationLogger(output_path, 'trn')
    val_log = InformationLogger(output_path, 'val')
    tst_log = InformationLogger(output_path, 'tst')

    model, criterion, optimizer, lr_scheduler, num_devices = set_hyperparameters(
        params, model, state_dict_path)

    num_samples = get_num_samples(data_path=data_path, params=params)
    print(f"Number of samples : {num_samples}")
    trn_dataloader, val_dataloader, tst_dataloader = create_dataloader(
        data_path=data_path, num_samples=num_samples, batch_size=batch_size,
        task=task)

    now = datetime.datetime.now().strftime("%Y-%m-%d_%I-%M ")
    filename = os.path.join(output_path, 'checkpoint.pth.tar')

    for epoch in range(0, params['training']['num_epochs']):
        print()
        print('Epoch {}/{}'.format(epoch,
                                   params['training']['num_epochs'] - 1))
        print('-' * 20)

        trn_report = train(train_loader=trn_dataloader,
                           model=model,
                           criterion=criterion,
                           optimizer=optimizer,
                           scheduler=lr_scheduler,
                           num_classes=num_classes,
                           batch_size=batch_size,
                           task=task,
                           ep_idx=epoch,
                           progress_log=progress_log,
                           num_devices=num_devices)
        trn_log.add_values(trn_report, epoch,
                           ignore=['precision', 'recall', 'fscore', 'iou'])

        val_report = evaluation(
            eval_loader=val_dataloader,
            model=model,
            criterion=criterion,
            num_classes=num_classes,
            batch_size=batch_size,
            task=task,
            ep_idx=epoch,
            progress_log=progress_log,
            batch_metrics=params['training']['batch_metrics'],
            dataset='val',
            num_devices=num_devices)
        val_loss = val_report['loss'].avg
        if params['training']['batch_metrics'] is not None:
            val_log.add_values(val_report, epoch)
        else:
            val_log.add_values(val_report, epoch,
                               ignore=['precision', 'recall', 'fscore', 'iou'])

        if val_loss < best_loss:
            print("save checkpoint")
            best_loss = val_loss
            torch.save(
                {
                    'epoch': epoch,
                    'arch': model_name,
                    'model': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict()
                }, filename)

            if bucket_name:
                bucket_filename = os.path.join(bucket_output_path,
                                               'checkpoint.pth.tar')
                bucket.upload_file(filename, bucket_filename)

        if bucket_name:
            save_logs_to_bucket(bucket, bucket_output_path, output_path, now,
                                params['training']['batch_metrics'])

        cur_elapsed = time.time() - since
        print('Current elapsed time {:.0f}m {:.0f}s'.format(
            cur_elapsed // 60, cur_elapsed % 60))

    # Load the best checkpoint and evaluate it on the test dataset.
    model = load_from_checkpoint(filename, model)
    tst_report = evaluation(eval_loader=tst_dataloader,
                            model=model,
                            criterion=criterion,
                            num_classes=num_classes,
                            batch_size=batch_size,
                            task=task,
                            ep_idx=params['training']['num_epochs'],
                            progress_log=progress_log,
                            batch_metrics=params['training']['batch_metrics'],
                            dataset='tst',
                            num_devices=num_devices)
    tst_log.add_values(tst_report, params['training']['num_epochs'])

    if bucket_name:
        bucket_filename = os.path.join(bucket_output_path,
                                       'last_epoch.pth.tar')
        bucket.upload_file(
            "output.txt",
            os.path.join(bucket_output_path, f"Logs/{now}_output.txt"))
        bucket.upload_file(filename, bucket_filename)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
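# --- Illustration: invoking the config-driven entry point -----------------------
# A minimal usage sketch for the params-based main() above, assuming the YAML
# layout implied by the params['global'][...] and params['training'][...]
# lookups. The argument parsing below is illustrative, not the project's CLI.
if __name__ == '__main__':
    import argparse

    import yaml

    parser = argparse.ArgumentParser(description='Model training execution')
    parser.add_argument('param_file', metavar='file',
                        help='Path to the yaml config file')
    args = parser.parse_args()
    with open(args.param_file) as f:
        params = yaml.safe_load(f)
    main(params)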