def run_srgan(device, image_size, batch_size, config, run_dir, saved_dir, run_name,
              num_epochs, val_dataset, train_dataset=None, checkpoints=None,
              mode='train', gpu_num=1):
    srgan_generator = SRGenerator(device).to(device)
    srgan_discriminator = SRDiscriminator().to(device)

    summary(srgan_discriminator, (3, image_size, image_size))
    summary(srgan_generator, (3, image_size // 4, image_size // 4))

    if checkpoints is not None:
        utils.load_from_checkpoint(srgan_generator, saved_dir, checkpoints["generator"])
        utils.load_from_checkpoint(srgan_discriminator, saved_dir, checkpoints["discriminator"])

    run_name = 'SRGAN' + '_' + run_name

    if mode == 'train':
        inception_FID_scores = train_gan(num_epochs, batch_size, None, device,
                                         train_dataset, val_dataset,
                                         srgan_generator, srgan_discriminator,
                                         type='SRGAN', config=config, run_dir=run_dir,
                                         saved_dir=saved_dir, run_name=run_name,
                                         calc_IS=False)
    elif mode == 'test':
        inception_FID_scores = [
            calc_inception_FID_score(batch_size, device, val_dataset,
                                     srgan_generator, type='SRGAN')
        ]

    date_str = datetime.datetime.now().strftime("%m%d%Y%H")
    utils.save_to_pickle(inception_FID_scores,
                         os.path.join(saved_dir, 'srgan_fid_' + date_str + ".pickle"))
    return inception_FID_scores
def ternausnet(num_classes, state_dict_path):
    """Build a TernausNet (UNet11) model and load its weights from a checkpoint.

    Note on the upstream UNet11 ``pretrained`` options:
        False     - no pre-trained network is used
        True      - encoder is pre-trained with VGG11
        'carvana' - all weights are pre-trained on the Kaggle Carvana dataset
                    https://www.kaggle.com/c/carvana-image-masking-challenge
    """
    model = UNet11(num_classes)
    model = load_from_checkpoint(state_dict_path, model)
    return model
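# A minimal usage sketch for the ternausnet() wrapper above. The checkpoint path and the
# class count are hypothetical placeholders, not values taken from this codebase.
example_checkpoint = 'checkpoint.pth.tar'  # hypothetical path to a saved state dict
ternausnet_model = ternausnet(num_classes=2, state_dict_path=example_checkpoint)
ternausnet_model.eval()  # switch to inference mode before predicting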
def main(params):
    """
    Identify the class to which each image belongs.
    :param params: (dict) Parameters found in the yaml config file.
    """
    since = time.time()
    csv_file = params['inference']['img_csv_file']
    bucket = None
    bucket_name = params['global']['bucket_name']

    model, state_dict_path, model_name = net(params, inference=True)
    if torch.cuda.is_available():
        model = model.cuda()

    if bucket_name:
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        bucket.download_file(csv_file, 'img_csv_file.csv')
        list_img = read_csv('img_csv_file.csv', inference=True)
    else:
        list_img = read_csv(csv_file, inference=True)

    if params['global']['task'] == 'classification':
        classifier(params, list_img, model)
    elif params['global']['task'] == 'segmentation':
        if bucket:
            bucket.download_file(state_dict_path, "saved_model.pth.tar")
            model = load_from_checkpoint("saved_model.pth.tar", model)
        else:
            model = load_from_checkpoint(state_dict_path, model)

        chunk_size, nbr_pix_overlap = calc_overlap(params)
        num_classes = params['global']['num_classes']
        for img in list_img:
            img_name = os.path.basename(img['tif'])
            if bucket:
                local_img = f"Images/{img_name}"
                bucket.download_file(img['tif'], local_img)
                inference_image = f"Classified_Images/{img_name.split('.')[0]}_inference.tif"
            else:
                local_img = img['tif']
                inference_image = os.path.join(params['inference']['working_folder'],
                                               f"{img_name.split('.')[0]}_inference.tif")

            assert_band_number(local_img, params['global']['number_of_bands'])

            nd_array_tif = image_reader_as_array(local_img)
            sem_seg_results = sem_seg_inference(model, nd_array_tif, nbr_pix_overlap,
                                                chunk_size, num_classes)
            create_new_raster_from_base(local_img, inference_image, sem_seg_results)
            print(f"Semantic segmentation of image {img_name} completed")
            if bucket:
                bucket.upload_file(inference_image,
                                   os.path.join(params['inference']['working_folder'],
                                                f"{img_name.split('.')[0]}_inference.tif"))
    else:
        raise ValueError(f"The task should be either classification or segmentation. "
                         f"The provided value is {params['global']['task']}")

    time_elapsed = time.time() - since
    print('Inference completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
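# A minimal sketch of the params dict read by the inference main() above, listing only the
# keys the function actually accesses. The values are illustrative placeholders, not
# defaults from the project's yaml config.
example_inference_params = {
    'global': {
        'bucket_name': None,      # set to an S3 bucket name to download inputs from S3
        'task': 'segmentation',   # 'classification' or 'segmentation'
        'num_classes': 2,
        'number_of_bands': 3,
    },
    'inference': {
        'img_csv_file': 'images_to_infer.csv',
        'working_folder': 'inference_output',
    },
}
# main(example_inference_params)  # would also require the model-related keys read by net()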
def main(bucket_name, data_path, output_path, num_trn_samples, num_val_samples,
         pretrained, batch_size, num_epochs, learning_rate, weight_decay, step_size,
         gamma, num_classes, class_weights, batch_metrics, model, classifier, model_name):
    """Function to train and validate a model for semantic segmentation.

    Args:
        bucket_name: bucket in which data is stored if using AWS S3
        data_path: full file path of the folder containing h5py files
        output_path: full file path in which the model will be saved
        num_trn_samples: number of training samples
        num_val_samples: number of validation samples
        pretrained: boolean indicating if the model is pretrained
        batch_size: number of samples to process simultaneously
        num_epochs: number of epochs
        learning_rate: learning rate
        weight_decay: weight decay
        step_size: step size
        gamma: multiplicative factor of learning rate decay
        num_classes: number of classes
        class_weights: weights to apply to each class. A value > 1.0 will apply more
            weight to the learning of the class
        batch_metrics: (int) compute metrics every (int) batches. If left blank,
            no metrics are computed
        model: CNN model (tensor)
        classifier: True if doing image classification, False if doing semantic segmentation
        model_name: name of the model used for training

    Returns:
        Files 'checkpoint.pth.tar' and 'last_epoch.pth.tar' containing the trained weights
    """
    if bucket_name:
        if output_path is None:
            bucket_output_path = None
        else:
            bucket_output_path = output_path
        output_path = 'output_path'
        try:
            os.mkdir(output_path)
        except FileExistsError:
            pass
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        if classifier:
            for i in ['trn', 'val']:
                get_s3_classification_images(i, bucket, bucket_name, data_path,
                                             output_path, num_classes)
            class_file = os.path.join(output_path, 'classes.csv')
            if bucket_output_path:
                bucket.upload_file(class_file, os.path.join(bucket_output_path, 'classes.csv'))
            else:
                bucket.upload_file(class_file, 'classes.csv')
            data_path = 'Images'
        else:
            if data_path:
                bucket.download_file(os.path.join(data_path, 'samples/trn_samples.hdf5'),
                                     'samples/trn_samples.hdf5')
                bucket.download_file(os.path.join(data_path, 'samples/val_samples.hdf5'),
                                     'samples/val_samples.hdf5')
            else:
                bucket.download_file('samples/trn_samples.hdf5', 'samples/trn_samples.hdf5')
                bucket.download_file('samples/val_samples.hdf5', 'samples/val_samples.hdf5')
            verify_sample_count(num_trn_samples, num_val_samples, data_path, bucket_name)
    elif classifier:
        get_local_classes(num_classes, data_path, output_path)
    else:
        verify_sample_count(num_trn_samples, num_val_samples, data_path, bucket_name)

    verify_weights(num_classes, class_weights)
    since = time.time()
    best_loss = 999
    trn_log = InformationLogger(output_path, 'trn')
    val_log = InformationLogger(output_path, 'val')

    if torch.cuda.is_available():
        model = model.cuda()
        if class_weights:
            criterion = nn.CrossEntropyLoss(weight=torch.tensor(class_weights)).cuda()
        else:
            criterion = nn.CrossEntropyLoss().cuda()
    else:
        if class_weights:
            criterion = nn.CrossEntropyLoss(weight=torch.tensor(class_weights))
        else:
            criterion = nn.CrossEntropyLoss()

    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)  # learning rate decay

    if pretrained != '':
        model, optimizer = load_from_checkpoint(pretrained, model, optimizer)

    if classifier:
        trn_dataset = torchvision.datasets.ImageFolder(
            os.path.join(data_path, "trn"),
            transform=transforms.Compose([
                transforms.RandomRotation((0, 275)),
                transforms.RandomHorizontalFlip(),
                transforms.Resize(299),
                transforms.ToTensor()
            ]),
            loader=loader)
        val_dataset = torchvision.datasets.ImageFolder(
            os.path.join(data_path, "val"),
            transform=transforms.Compose([transforms.Resize(299), transforms.ToTensor()]),
            loader=loader)
    else:
        if not bucket_name:
            trn_dataset = CreateDataset.SegmentationDataset(
                os.path.join(data_path, "samples"), num_trn_samples, "trn",
                transform=transforms.Compose([
                    aug.RandomRotationTarget(),
                    aug.HorizontalFlip(),
                    aug.ToTensorTarget()
                ]))
            val_dataset = CreateDataset.SegmentationDataset(
                os.path.join(data_path, "samples"), num_val_samples, "val",
                transform=transforms.Compose([aug.ToTensorTarget()]))
        else:
            trn_dataset = CreateDataset.SegmentationDataset(
                'samples', num_trn_samples, "trn",
                transform=transforms.Compose([
                    aug.RandomRotationTarget(),
                    aug.HorizontalFlip(),
                    aug.ToTensorTarget()
                ]))
            val_dataset = CreateDataset.SegmentationDataset(
                "samples", num_val_samples, "val",
                transform=transforms.Compose([aug.ToTensorTarget()]))

    # Shuffle must be set to True.
    trn_dataloader = DataLoader(trn_dataset, batch_size=batch_size, num_workers=4, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, num_workers=4, shuffle=True)

    now = datetime.datetime.now().strftime("%Y-%m-%d_%I-%M ")

    for epoch in range(0, num_epochs):
        print()
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 20)

        trn_report = train(trn_dataloader, model, criterion, optimizer, lr_scheduler,
                           num_classes, batch_size, classifier)
        trn_log.add_values(trn_report, epoch)

        val_report = validation(val_dataloader, model, criterion, num_classes,
                                batch_size, classifier, batch_metrics)
        val_loss = val_report['loss'].avg
        if batch_metrics is not None:
            val_log.add_values(val_report, epoch, log_metrics=True)
        else:
            val_log.add_values(val_report, epoch)

        if val_loss < best_loss:
            print("save checkpoint")
            filename = os.path.join(output_path, 'checkpoint.pth.tar')
            best_loss = val_loss
            save_checkpoint({
                'epoch': epoch,
                'arch': model_name,
                'model': model.state_dict(),
                'best_loss': best_loss,
                'optimizer': optimizer.state_dict()
            }, filename)
            if bucket_name:
                if bucket_output_path:
                    bucket_filename = os.path.join(bucket_output_path, 'checkpoint.pth.tar')
                else:
                    bucket_filename = 'checkpoint.pth.tar'
                bucket.upload_file(filename, bucket_filename)

        if bucket_name:
            save_logs_to_bucket(bucket, bucket_output_path, output_path, now, batch_metrics)

        cur_elapsed = time.time() - since
        print('Current elapsed time {:.0f}m {:.0f}s'.format(cur_elapsed // 60, cur_elapsed % 60))

    filename = os.path.join(output_path, 'last_epoch.pth.tar')
    save_checkpoint({
        'epoch': epoch,
        'arch': model_name,
        'model': model.state_dict(),
        'best_loss': best_loss,
        'optimizer': optimizer.state_dict()
    }, filename)

    if bucket_name:
        if bucket_output_path:
            bucket_filename = os.path.join(bucket_output_path, 'last_epoch.pth.tar')
            bucket.upload_file("output.txt",
                               os.path.join(bucket_output_path, f"Logs/{now}_output.txt"))
        else:
            bucket_filename = 'last_epoch.pth.tar'
            bucket.upload_file("output.txt", f"Logs/{now}_output.txt")
        bucket.upload_file(filename, bucket_filename)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
def main(bucket, work_folder, img_list, weights_file_name, model, number_of_bands,
         overlay, classify, num_classes):
    """Identify the class to which each image belongs.

    Args:
        bucket: bucket in which data is stored if using AWS S3
        work_folder: full file path of the folder containing images
        img_list: list containing images to classify
        weights_file_name: full file path of the file containing weights
        model: loaded model with which inference should be done
        number_of_bands: number of bands in the input rasters
        overlay: amount of overlay to apply
        classify: True if doing a classification task, False if doing semantic segmentation
        num_classes: number of classes
    """
    if torch.cuda.is_available():
        model = model.cuda()

    if bucket:
        bucket.download_file(weights_file_name, "saved_model.pth.tar")
        model = load_from_checkpoint("saved_model.pth.tar", model)
        if classify:
            classes_file = weights_file_name.split('/')[:-1]
            class_csv = ''
            for folder in classes_file:
                class_csv = os.path.join(class_csv, folder)
            bucket.download_file(os.path.join(class_csv, 'classes.csv'), 'classes.csv')
            with open('classes.csv', 'rt') as file:
                reader = csv.reader(file)
                classes = list(reader)
    else:
        model = load_from_checkpoint(weights_file_name, model)
        if classify:
            classes_file = weights_file_name.split('/')[:-1]
            class_path = ''
            for c in classes_file:
                class_path = class_path + c + '/'
            with open(class_path + 'classes.csv', 'rt') as f:
                reader = csv.reader(f)
                classes = list(reader)

    since = time.time()
    classified_results = np.empty((0, 2 + num_classes))

    for img in img_list:
        img_name = os.path.basename(img['tif'])
        if bucket:
            local_img = f"Images/{img_name}"
            bucket.download_file(img['tif'], local_img)
            inference_image = f"Classified_Images/{img_name.split('.')[0]}_inference.tif"
        else:
            local_img = img['tif']
            inference_image = os.path.join(work_folder, f"{img_name.split('.')[0]}_inference.tif")

        assert_band_number(local_img, number_of_bands)

        if classify:
            outputs, predicted = classifier(bucket, model, img['tif'])
            top5 = heapq.nlargest(5, outputs.cpu().numpy()[0])
            top5_loc = []
            for i in top5:
                top5_loc.append(np.where(outputs.cpu().numpy()[0] == i)[0][0])
            print(f"Image {img_name} classified as {classes[0][predicted]}")
            print('Top 5 classes:')
            for i in range(0, 5):
                print(f"\t{classes[0][top5_loc[i]]} : {top5[i]}")
            classified_results = np.append(classified_results, [
                np.append([img['tif'], classes[0][predicted]], outputs.cpu().numpy()[0])
            ], axis=0)
            print()
        else:
            sem_seg_results = sem_seg_inference(bucket, model, img['tif'], overlay)
            create_new_raster_from_base(local_img, inference_image, sem_seg_results)
            print(f"Semantic segmentation of image {img_name} completed")

        if bucket:
            if not classify:
                bucket.upload_file(inference_image,
                                   os.path.join(work_folder, f"{img_name.split('.')[0]}_inference.tif"))

    if classify:
        csv_results = 'classification_results.csv'
        if bucket:
            np.savetxt(csv_results, classified_results, fmt='%s', delimiter=',')
            bucket.upload_file(csv_results, os.path.join(work_folder, csv_results))
        else:
            np.savetxt(os.path.join(work_folder, csv_results), classified_results,
                       fmt='%s', delimiter=',')

    time_elapsed = time.time() - since
    print('Inference completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
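# The img_list argument consumed above is a list of dicts keyed by 'tif', as read from the
# image CSV elsewhere in the project; a hand-built example (the paths are hypothetical):
example_img_list = [{'tif': 'Images/tile_001.tif'}, {'tif': 'Images/tile_002.tif'}]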
                    action='store',
                    dest='top_k',
                    default=5,
                    help='the number of probabilities to show')
parser.add_argument('--category_names',
                    action='store',
                    dest='category_names',
                    default=None,
                    help='the label names')
parser.add_argument('--gpu',
                    action='store',
                    dest='gpu',
                    default=True,
                    help='use the GPU for inference')
results = parser.parse_args()

image_path = results.input
checkpoint = results.checkpoint
top_k = int(results.top_k)
category_names_json = results.category_names
gpu = results.gpu
category_names = None

if category_names_json is None:
    print("No label to index mapping provided")
else:
    with open(str(category_names_json), 'r') as f:
        category_names = json.load(f)

model = utils.load_from_checkpoint(checkpoint)
pil_image = Image.open(image_path, 'r')
predict(pil_image, model, gpu, category_names, topk=top_k)
def run_dcgan(device, image_size, noise_size, batch_size, config, run_dir, saved_dir,
              run_name, num_epochs, val_dataset, train_dataset=None, checkpoints=None,
              mode='train', gpu_num=1):
    # Run DCGAN
    type = 'DCGAN'
    dcgan_generator = Generator(noise_size=noise_size, image_size=image_size).to(device)
    dcgan_discriminator = Discriminator(image_size=image_size).to(device)

    # Run in parallel on multiple GPUs for improved performance
    if device.type == 'cuda' and gpu_num > 1:
        dcgan_generator = nn.DataParallel(dcgan_generator, list(range(gpu_num)))
        dcgan_discriminator = nn.DataParallel(dcgan_discriminator, list(range(gpu_num)))

    # Print networks
    print('Discriminator')
    summary(dcgan_discriminator, (3, image_size, image_size))
    print('Generator')
    summary(dcgan_generator, (noise_size, 1, 1))

    if checkpoints is not None:
        utils.load_from_checkpoint(dcgan_generator, saved_dir, checkpoints["generator"])
        utils.load_from_checkpoint(dcgan_discriminator, saved_dir, checkpoints["discriminator"])

    run_name = 'DCGAN' + '_' + run_name

    # Train the model in 'train' mode; in 'test' mode only calculate the scores
    if mode == 'train':
        inception_FID_scores, inception_scores = train_gan(num_epochs, batch_size, noise_size,
                                                           device, train_dataset, val_dataset,
                                                           dcgan_generator, dcgan_discriminator,
                                                           type='DCGAN', config=config,
                                                           run_dir=run_dir, saved_dir=saved_dir,
                                                           run_name=run_name)
    elif mode == 'test':
        inception_FID_scores = [
            calc_inception_FID_score(batch_size, device, val_dataset, dcgan_generator,
                                     type, noise_size)
        ]
        inception_scores = [
            calc_inception_score(device, noise_size, dcgan_generator, eval_size=len(val_dataset))
        ]

    # Save and return the lists of scores accumulated over the epochs
    date_str = datetime.datetime.now().strftime("%m%d%Y%H")
    save_to_pickle(inception_FID_scores,
                   os.path.join(saved_dir, 'dcgan_fid_' + run_name + date_str + ".pickle"))
    save_to_pickle(inception_scores,
                   os.path.join(saved_dir, 'dcgan_IS_' + run_name + date_str + ".pickle"))
    return inception_FID_scores, inception_scores
def run_sagan(device, image_size, noise_size, batch_size, config, run_dir, saved_dir,
              run_name, num_epochs, val_dataset, train_dataset=None, checkpoints=None,
              mode='train', gpu_num=1):
    type = 'SAGAN'
    sagan_generator = SAGenerator(noise_size=noise_size, image_size=image_size).to(device)
    sagan_discriminator = SADiscriminator(image_size=image_size).to(device)

    # Parallel for improved performance
    if (device.type == 'cuda') and (gpu_num > 1):
        sagan_generator = nn.DataParallel(sagan_generator, list(range(gpu_num)))
        sagan_discriminator = nn.DataParallel(sagan_discriminator, list(range(gpu_num)))

    # Print networks
    print('Discriminator')
    summary(sagan_discriminator, (3, image_size, image_size))
    print('Generator')
    summary(sagan_generator, (noise_size, 1, 1))

    if checkpoints is not None:
        utils.load_from_checkpoint(sagan_generator, saved_dir, checkpoints["generator"])
        utils.load_from_checkpoint(sagan_discriminator, saved_dir, checkpoints["discriminator"])

    run_name = 'SAGAN' + '_' + run_name

    if mode == 'train':
        inception_FID_scores, inception_scores = train_gan(num_epochs, batch_size, noise_size,
                                                           device, train_dataset, val_dataset,
                                                           sagan_generator, sagan_discriminator,
                                                           type='SAGAN', config=config,
                                                           run_dir=run_dir, saved_dir=saved_dir,
                                                           run_name=run_name)
    elif mode == 'test':
        inception_FID_scores = [
            calc_inception_FID_score(batch_size, device, val_dataset, sagan_generator,
                                     type, noise_size)
        ]
        inception_scores = [
            calc_inception_score(device, noise_size, sagan_generator, eval_size=len(val_dataset))
        ]

    date_str = datetime.datetime.now().strftime("%m%d%Y%H")
    utils.save_to_pickle(inception_FID_scores,
                         os.path.join(saved_dir, 'sagan_fid_' + run_name + date_str + ".pickle"))
    utils.save_to_pickle(inception_scores,
                         os.path.join(saved_dir, 'sagan_IS_' + run_name + date_str + ".pickle"))
    return inception_FID_scores, inception_scores
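# A hedged usage sketch for the GAN runners above (run_sagan shown; run_dcgan and run_srgan
# follow the same calling pattern). `train_set`, `val_set` and `gan_config` are hypothetical
# placeholders for objects this file does not define, and the sizes shown are illustrative.
def example_run_sagan(train_set, val_set, gan_config):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    fid_scores, is_scores = run_sagan(device, image_size=64, noise_size=100, batch_size=64,
                                      config=gan_config, run_dir='runs', saved_dir='saved',
                                      run_name='baseline', num_epochs=50,
                                      val_dataset=val_set, train_dataset=train_set,
                                      mode='train', gpu_num=1)
    return fid_scores, is_scores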
def main(options=None):
    args = get_args()
    if options is not None:
        args = utils.load_options(args, options)

    seed = 1  # Do NOT modify the seed. The captions have been generated from images generated from this seed.
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # -------------------------------- INSTANTIATE MAIN ACTORS ----------------------------- #
    # --------------- Create dataset ---------------- #
    print('Creating dataset', flush=True)
    image_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                           std=[0.229, 0.224, 0.225])
    transform = transforms.Compose([
        transforms.Resize(128),  # Smaller edge will be matched to this number
        transforms.CenterCrop((128, 128)),
        transforms.ToTensor(),
        image_normalize,
    ])

    train_dataset = dataset.ImageAudioDataset(args.folder_dataset + args.name_dataset,
                                              split='train',
                                              random_sampling=True,
                                              transform=transform,
                                              loading_image=args.loading_image)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=None)

    val_dataset = dataset.ImageAudioDataset(args.folder_dataset + args.name_dataset,
                                            split='val',
                                            transform=transform,
                                            loading_image=args.loading_image)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    test_dataset = dataset.ImageAudioDataset(args.folder_dataset + args.name_dataset,
                                             split='test',
                                             transform=transform,
                                             loading_image=args.loading_image)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              pin_memory=True)

    # -------------- Create model --------------- #
    print('Creating model', flush=True)
    module = __import__('models')
    model_class = getattr(module, args.model)
    model = model_class(args)
    model = torch.nn.DataParallel(model).cuda()

    # Print model information
    utils.print_model_report(model)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Load model
    resume_epoch = 0
    if args.seed:
        if args.seed == 'EXPDIR':
            if args.name_checkpoint == '':
                name = args.model + '_' + args.name_dataset
            else:
                name = args.name_checkpoint
            path_load = args.expdir + 'model_best_' + name + '.pth.tar'
        else:
            path_load = args.seed
        if args.resume:
            utils.load_from_checkpoint(model, path_load,
                                       submodels_load=args.submodels_load,
                                       optimizer=None)
            checkpoint = torch.load(path_load)
            resume_epoch = checkpoint['epoch']
        else:
            utils.load_from_checkpoint(model, path_load,
                                       submodels_load=args.submodels_load,
                                       optimizer=None)

    # --------------- Instantiate trainer --------------- #
    print('Instantiating trainer', flush=True)
    all_loaders = {'val': val_loader, 'train': train_loader, 'test': test_loader}
    trainer = Trainer(model, optimizer, all_loaders, args, resume_epoch=resume_epoch)

    # ------------------------- Others ----------------------- #
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    log_dir = os.path.join(args.results, 'runs',
                           args.name_checkpoint + '_' + current_time + '_' + socket.gethostname())
    args.writer = SummaryWriter(log_dir=log_dir)

    # ----------------------------------- TRAIN ------------------------------------------ #
    if args.experiment:
        print("Running experiment", flush=True)
        experiments.experiment(args.experiment_name, trainer)
    elif args.evaluate:
        print("Performing evaluation epoch", flush=True)
        trainer.eval()
    elif args.generate_active_learning:
        print("Generating active learning samples", flush=True)
        active_learning.generate_active_learning(trainer)
    else:
        print("Beginning training", flush=True)
        trainer.train()
def main(params):
    """
    Function to train and validate a model for semantic segmentation or classification.
    :param params: (dict) Parameters found in the yaml config file.
    """
    model, state_dict_path, model_name = net(params)
    bucket_name = params['global']['bucket_name']
    output_path = params['training']['output_path']
    data_path = params['global']['data_path']
    task = params['global']['task']
    num_classes = params['global']['num_classes']
    batch_size = params['training']['batch_size']

    if bucket_name:
        bucket, bucket_output_path, output_path, data_path = download_s3_files(
            bucket_name=bucket_name,
            data_path=data_path,
            output_path=output_path,
            num_classes=num_classes,
            task=task)
    elif not bucket_name and task == 'classification':
        get_local_classes(num_classes, data_path, output_path)

    since = time.time()
    best_loss = 999

    progress_log = Path(output_path) / 'progress.log'
    if not progress_log.exists():
        # Add header
        progress_log.open('w', buffering=1).write(tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time'))

    trn_log = InformationLogger(output_path, 'trn')
    val_log = InformationLogger(output_path, 'val')
    tst_log = InformationLogger(output_path, 'tst')

    model, criterion, optimizer, lr_scheduler, num_devices = set_hyperparameters(
        params, model, state_dict_path)

    num_samples = get_num_samples(data_path=data_path, params=params)
    print(f"Number of samples : {num_samples}")
    trn_dataloader, val_dataloader, tst_dataloader = create_dataloader(
        data_path=data_path,
        num_samples=num_samples,
        batch_size=batch_size,
        task=task)

    now = datetime.datetime.now().strftime("%Y-%m-%d_%I-%M ")
    filename = os.path.join(output_path, 'checkpoint.pth.tar')

    for epoch in range(0, params['training']['num_epochs']):
        print()
        print('Epoch {}/{}'.format(epoch, params['training']['num_epochs'] - 1))
        print('-' * 20)

        trn_report = train(train_loader=trn_dataloader,
                           model=model,
                           criterion=criterion,
                           optimizer=optimizer,
                           scheduler=lr_scheduler,
                           num_classes=num_classes,
                           batch_size=batch_size,
                           task=task,
                           ep_idx=epoch,
                           progress_log=progress_log,
                           num_devices=num_devices)
        trn_log.add_values(trn_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou'])

        val_report = evaluation(eval_loader=val_dataloader,
                                model=model,
                                criterion=criterion,
                                num_classes=num_classes,
                                batch_size=batch_size,
                                task=task,
                                ep_idx=epoch,
                                progress_log=progress_log,
                                batch_metrics=params['training']['batch_metrics'],
                                dataset='val',
                                num_devices=num_devices)
        val_loss = val_report['loss'].avg
        if params['training']['batch_metrics'] is not None:
            val_log.add_values(val_report, epoch)
        else:
            val_log.add_values(val_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou'])

        if val_loss < best_loss:
            print("save checkpoint")
            best_loss = val_loss
            torch.save({
                'epoch': epoch,
                'arch': model_name,
                'model': model.state_dict(),
                'best_loss': best_loss,
                'optimizer': optimizer.state_dict()
            }, filename)
            if bucket_name:
                bucket_filename = os.path.join(bucket_output_path, 'checkpoint.pth.tar')
                bucket.upload_file(filename, bucket_filename)

        if bucket_name:
            save_logs_to_bucket(bucket, bucket_output_path, output_path, now,
                                params['training']['batch_metrics'])

        cur_elapsed = time.time() - since
        print('Current elapsed time {:.0f}m {:.0f}s'.format(cur_elapsed // 60, cur_elapsed % 60))

    # Load the checkpointed model and evaluate it on the test dataset.
    model = load_from_checkpoint(filename, model)
    tst_report = evaluation(eval_loader=tst_dataloader,
                            model=model,
                            criterion=criterion,
                            num_classes=num_classes,
                            batch_size=batch_size,
                            task=task,
                            ep_idx=params['training']['num_epochs'],
                            progress_log=progress_log,
                            batch_metrics=params['training']['batch_metrics'],
                            dataset='tst',
                            num_devices=num_devices)
    tst_log.add_values(tst_report, params['training']['num_epochs'])

    if bucket_name:
        bucket_filename = os.path.join(bucket_output_path, 'last_epoch.pth.tar')
        bucket.upload_file("output.txt",
                           os.path.join(bucket_output_path, f"Logs/{now}_output.txt"))
        bucket.upload_file(filename, bucket_filename)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
def set_hyperparameters(params, model, state_dict_path):
    """
    Function to set hyperparameters based on values provided in the yaml config file.
    Will also move the model to GPU, if available.
    If no values are provided, the functions' default values are used.
    :param params: (dict) Parameters found in the yaml config file
    :param model: Model loaded from model_choice.py
    :param state_dict_path: (str) Full file path to the state dict
    :return: model, criterion, optimizer, lr_scheduler, num_devices
    """
    loss_signature = inspect.signature(nn.CrossEntropyLoss).parameters
    adam_signature = inspect.signature(optim.Adam).parameters
    lr_scheduler_signature = inspect.signature(optim.lr_scheduler.StepLR).parameters

    class_weights = loss_signature['weight'].default
    ignore_index = loss_signature['ignore_index'].default
    lr = adam_signature['lr'].default
    weight_decay = adam_signature['weight_decay'].default
    step_size = lr_scheduler_signature['step_size'].default
    if not isinstance(step_size, int):
        step_size = params['training']['num_epochs'] + 1
    gamma = lr_scheduler_signature['gamma'].default
    num_devices = 0

    if params['training']['class_weights'] is not None:
        class_weights = torch.tensor(params['training']['class_weights'])
        verify_weights(params['global']['num_classes'], class_weights)
    if params['training']['ignore_index'] is not None:
        ignore_index = params['training']['ignore_index']
    if params['training']['learning_rate'] is not None:
        lr = params['training']['learning_rate']
    if params['training']['weight_decay'] is not None:
        weight_decay = params['training']['weight_decay']
    if params['training']['step_size'] is not None:
        step_size = params['training']['step_size']
    if params['training']['gamma'] is not None:
        gamma = params['training']['gamma']
    if params['global']['num_gpus'] is not None:
        num_devices = params['global']['num_gpus']

    if torch.cuda.is_available():
        lst_device_ids = get_device_ids(num_devices)
    else:
        lst_device_ids = []

    if lst_device_ids:
        if len(lst_device_ids) == 1:
            print(f"Using Cuda device {lst_device_ids[0]}")
        torch.cuda.set_device(lst_device_ids[0])
        model = model.cuda()
        criterion = nn.CrossEntropyLoss(weight=class_weights, ignore_index=ignore_index).cuda()
        num_devices = len(lst_device_ids)
        if len(lst_device_ids) > 1:
            print(f"Using data parallel on devices {str(lst_device_ids)[1:-1]}")
            model = nn.DataParallel(model, device_ids=lst_device_ids)
    else:
        warnings.warn("No Cuda device available. This process will only run on CPU")
        num_devices = 0
        criterion = nn.CrossEntropyLoss(weight=class_weights, ignore_index=ignore_index)

    optimizer = optim.Adam(params=model.parameters(), lr=lr, weight_decay=weight_decay)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=step_size, gamma=gamma)

    if state_dict_path != '':
        model, optimizer = load_from_checkpoint(state_dict_path, model, optimizer=True)

    return model, criterion, optimizer, lr_scheduler, num_devices
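# A minimal sketch of the params entries read by set_hyperparameters() above. Keys left as
# None fall back to the defaults pulled from the loss/optimizer/scheduler signatures; the
# non-None values shown are illustrative, not defaults from the project's yaml config.
example_hyperparams = {
    'global': {
        'num_classes': 2,
        'num_gpus': 1,
    },
    'training': {
        'num_epochs': 100,
        'class_weights': None,   # e.g. [1.0, 2.0] to upweight the second class
        'ignore_index': None,
        'learning_rate': 0.0001,
        'weight_decay': None,
        'step_size': None,
        'gamma': None,
    },
}
# model, criterion, optimizer, lr_scheduler, num_devices = set_hyperparameters(
#     example_hyperparams, model, state_dict_path='')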