def seg_learner(tiny_seg_databunch, seg_classes):
    """Build a U-Net learner for the tiny segmentation databunch.

    Uses a resnet18 backbone, weight decay 1e-2, and the objective
    function over ``seg_classes`` as the reported metric.
    """
    objective = get_objective_fct(seg_classes)
    return unet_learner(
        tiny_seg_databunch,
        models.resnet18,
        wd=1e-2,
        metrics=objective,
    )
def create_model_from_zip(self, weights_fn):
    """Creates a deep learning model linked to the dataset and stores it as
    an instance attribute.
    """
    weights_fn = weights_fn.resolve()
    logger.info(
        f"Unzipping the model weights and label classes from {weights_fn}")
    output_dir = self.root_dir / "extracted_model_files"
    output_dir.mkdir(exist_ok=True)
    # Unpack weights plus the JSON file listing the label classes.
    with ZipFile(weights_fn, mode='r') as archive:
        archive.extractall(output_dir)
    codes_path = output_dir / f"{weights_fn.stem}_codes.json"
    with open(codes_path) as codes_file:
        self.codes = json.load(codes_file)
    # A dummy dataset must exist before fastai can rebuild the learner.
    self.create_dummy_files()
    self.create_dummy_dataset()
    logger.info("Creating 2d U-net model for prediction.")
    self.model = unet_learner(self.data, models.resnet34,
                              model_dir=output_dir)
    logger.info("Loading in the saved weights.")
    self.model.load(weights_fn.stem)
    # Remove the restriction on the model prediction size
    self.model.data.single_ds.tfmargs['size'] = None
def seg_learner(tiny_seg_databunch, seg_classes):
    """Return a U-Net learner over the tiny segmentation databunch.

    resnet18 backbone, wd=1e-2, with the ratio-correct metric computed
    for ``seg_classes``.
    """
    ratio_correct = get_ratio_correct_metric(seg_classes)
    return unet_learner(
        tiny_seg_databunch,
        models.resnet18,
        wd=1e-2,
        metrics=ratio_correct,
    )
def create_learner_gen(data):
    """Create the generator U-Net learner.

    resnet34 backbone with blur, weight-normalized layers, self-attention,
    an L1 loss (also reported as the metric), and outputs clamped to
    (-3, 3) via ``y_range``.
    """
    return fv.unet_learner(
        data,
        fv.models.resnet34,
        wd=1e-2,
        loss_func=l1_loss_flat,
        metrics=[l1_loss_flat],
        blur=True,
        self_attention=True,
        norm_type=fai.NormType.Weight,
        y_range=(-3., 3.),
    )
def train_and_eval():
    """Train a U-Net segmentation model end-to-end and evaluate it.

    Builds the databunch from the configured image/label paths, trains a
    resnet34-backed U-Net with the dice metric, then evaluates on the
    configured test set and prints loss and accuracy.
    """
    print("Creating databunch and learner...")
    data = create_databunch(cf.PATH_IMG, cf.PATH_LBL, cf.CODES, cf.INPUT_SIZE, cf.BATCH_SIZE)
    learner = unet_learner(data, models.resnet34, metrics=dice)
    print("Training model...")
    train_model(learner, cf.FREEZE_LAYER, cf.EPOCHS, cf.LEARNING_RATE, cf.WEIGHT_DECAY, cf.SAVE_MODEL)
    print("Evaluating model...")
    # Renamed from `eval`, which shadowed the builtin.
    results = eval_model(cf.PATH_TO_TESTING, cf.CODES, cf.INPUT_SIZE, cf.BATCH_SIZE, learner)
    print(f'Loss = {results[0]}, Accuracy = {results[1]}')
    # BUGFIX: the two adjacent string literals concatenated without a
    # separating space, printing "...model!Please find...".
    print("You have successfully trained and evaluated your model! "
          "Please find it in the appropriate directory.")
def get_learner(data, metrics=None, model_dir='models'):
    """Construct the segmentation U-Net learner.

    Args:
        data: databunch to train on.
        metrics: optional list of metric callables (defaults to none).
        model_dir: directory for model checkpoints.

    Returns:
        A learner with self-attention and generalized dice loss; converted
        to half precision when a GPU is in use.
    """
    learner = unet_learner(
        data,
        models.resnet34,
        metrics=[] if metrics is None else metrics,
        self_attention=True,
        loss_func=generalized_dice_loss,
        wd=1e-7,
        model_dir=model_dir,
    )
    # Mixed precision only makes sense on the GPU.
    return learner.to_fp16() if USE_GPU else learner
def create_model(self):
    """Creates a deep learning model linked to the dataset and stores it as
    an instance attribute.
    """
    logger.info("Creating 2d U-net model for training.")
    # Log every epoch to CSV and keep the checkpoint that maximizes the
    # monitored metric.
    training_callbacks = [
        partial(CSVLogger, filename='unet_training_history', append=True),
        partial(SaveModelCallback, monitor=self.monitor, mode='max',
                name="best_unet_model"),
    ]
    self.model = unet_learner(
        self.data,
        models.resnet34,
        metrics=self.metrics,
        wd=self.weight_decay,
        loss_func=self.loss_func,
        callback_fns=training_callbacks,
    )
def get_learn(data, model, name, weighted, cut):
    """Build a half-precision U-Net learner with a cross-entropy loss.

    Args:
        data: databunch to train on.
        model: backbone architecture passed to ``unet_learner``.
        name: model directory name used for checkpoints.
        weighted: if True, weight the loss by per-class weights derived
            from the data.
        cut: where to cut the backbone (forwarded to ``unet_learner``).

    Returns:
        A learner in fp16 whose loss is ``CrossEntropyFlat`` with
        ``ignore_index=0`` (optionally class-weighted).
    """
    metrics = get_metrics()
    learn = unet_learner(
        data,
        model,
        split_on=_resnet_split,
        cut=cut,
        metrics=metrics,
        path="models",
        model_dir=name,
        wd=1e-2,
    )
    # BUGFIX: fastai's Learner reads `loss_func`; the previous code assigned
    # `loss_fn`, which is not consulted, so the custom/weighted loss was
    # silently ignored (the rest of this codebase uses `loss_func`).
    if weighted:
        weights = get_loss_weights(data, learn)
        learn.loss_func = CrossEntropyFlat(weight=Tensor(weights).cuda(),
                                           ignore_index=0)
    else:
        learn.loss_func = CrossEntropyFlat(ignore_index=0)
    learn = learn.to_fp16()
    return learn
def init_model(self):
    """ Initialize learner with parameters set in constructor.

    Builds a fastai segmentation databunch from the video frames and
    creates a resnet18-backed ``unet_learner`` stored on ``self.learner``.
    """
    defaults.device = torch.device(self.device)
    # Training resolution: source frame size scaled down by the configured factor.
    size = self.src_shape // self.train_params['resize_factor']
    tfms = get_transforms(do_flip=True, flip_vert=False, max_lighting=0.8,
                          p_affine=0, p_lighting=0.5)
    # Drop the first training transform; keep the validation transforms as-is.
    tfms = (tfms[0][1:], tfms[1])
    get_label = partial(get_label_from_image_name, self.labels_path)
    # With a valid_func we get a real train/valid split; otherwise train on
    # everything (split_none). Labels are binary: non-screen vs screen.
    if self.valid_func:
        src = (SegItemListCustom.from_folder(
            self.videos_path, ignore_empty=True, recurse=True).filter_by_func(
                self.filtered_by).split_by_valid_func(
                    self.valid_func).label_from_func(
                        get_label, classes=np.array(['non-screen', 'screen'])))
    else:
        src = (SegItemListCustom.from_folder(
            self.videos_path, ignore_empty=True, recurse=True).filter_by_func(
                self.filtered_by).split_none().label_from_func(
                    get_label, classes=np.array(['non-screen', 'screen'])))
    LOGGER.info("Creating databunch with transformations")
    # tfm_y=True applies the spatial transforms to the masks as well.
    data = (src.transform(tfms, size=size, tfm_y=True).databunch(
        bs=self.train_params['batch_size']).normalize(imagenet_stats))
    LOGGER.info("Creating unet-learner with resnet18 backbone.")
    self.learner = unet_learner(data, models.resnet18,
                                metrics=[acc, dice, iou_sem_seg])
def bce_logits_floatify(input, target, reduction='mean'):
    """BCE-with-logits loss that first casts the target to float."""
    target = target.float()
    return F.binary_cross_entropy_with_logits(input, target,
                                              reduction=reduction)


def dice_metric(pred, targs, threshold=0):
    """Dice score between thresholded predictions and float targets.

    The denominator carries a +1 smoothing term, so an all-empty pair
    scores 0 rather than dividing by zero.
    """
    hard = (pred > threshold).float()
    gt = targs.float()  # make sure target is float too
    intersection = (hard * gt).sum()
    denominator = (hard + gt).sum() + 1.0
    return 2.0 * intersection / denominator


if __name__ == '__main__':
    torch.cuda.empty_cache()
    dataset = CloudDataset('train')
    train, test = dataset.split(train_ratio=.8, test_ratio=.2)
    print(len(train), len(test))
    # NOTE: the same split is used for both train and valid here.
    bunch = DataBunch.create(train, train, bs=2, num_workers=1)
    learn = unet_learner(
        data=bunch,
        arch=models.resnet18,
        loss_func=bce_logits_floatify,
        metrics=[dice_metric],
    )
    print(learn.model[0][0])
    print(learn.model[-1][-1])
    learn.fit(epochs=20)
print(data.show_batch(2, figsize=(10, 7), ds_type=vision.DatasetType.Valid)) # Define accuracy object2id = {value: key for key, value in enumerate(objects_in_image)} void_index = object2id['Void'] def camvid_accuracy(inputs, target): target = target.squeeze(1) mask = target != void_index return (inputs.argmax(dim=1)[mask] == target[mask]).float().mean() # Define model learner = vision.unet_learner(data, vision.models.resnet34, metrics=camvid_accuracy, wd=WD) # Find good LR learner.lr_find() learner.recorder.plot() learner.fit_one_cycle(EPOCHS_FINETUNE, max_lr=slice(LR), pct_start=PCT_START_FINETUNE) learner.save('stage-1-34-unet') # Show results learner.show_results(rows=3, figsize=(8, 9)) # After warming up, start to train all network learner.unfreeze() learner.fit_one_cycle(EPOCHS, max_lr=slice(LR / 400, LR / 4),
def train_unet(epochs=5, batch_size=1, lr=0.1, val_percent=0.1):
    """Train a U-Net on the zernike PSF-mask dataset with flat-anneal schedule.

    Args:
        epochs: epochs per training phase (frozen, then unfrozen).
        batch_size: batch size (reduced ~1.5x when running locally).
        lr: peak learning rate for the frozen phase (lr/5 after unfreeze).
        val_percent: fraction of files held out for validation.
    """
    print("Start script")
    # Local run vs. grid run: different data roots and a smaller batch locally.
    if args.isgrid is False:
        filename = "/media/adrian/E2B45A26B459FD8B/psfmaskmoving_zernike2d_128_n_1_s_0_p_0_b_0__noise_1_2dzernike_test/"
        batch_size = int(batch_size // 1.5)
    else:
        filename = "/idiap/temp/ashajkofci/psfmaskmoving_zernike2d_128_n_1_s_0_p_0_b_0__noise_1_2dzernike_train/"
        batch_size = batch_size
    # NOTE(review): no path separator before 'data' -- likely meant '/data'
    # (the sibling script uses os.getcwd()+'/data'); confirm intended path.
    os.environ['TORCH_HOME'] = os.getcwd() + 'data'
    #transform = transforms.Compose([
    #    transforms.ToPILImage(),
    #    transforms.RandomCrop([450, 450]),
    #    transforms.RandomVerticalFlip(),
    #    transforms.RandomHorizontalFlip(),
    #    transforms.ToTensor(),
    #    ])
    all_files_list = glob.glob(filename + "*/*.png")
    print('{} files found in {}'.format(len(all_files_list), filename))
    # Drop the mask images; keep only the inputs.
    all_files_list = [x for x in all_files_list if "_mask" not in x]
    print('{} files found'.format(len(all_files_list)))
    # Sort numerically by the 9-digit index embedded in the filename.
    all_files_list = sorted(all_files_list, key=lambda name: int(name[-13:-4]))
    print('{} files found'.format(len(all_files_list)))
    #all_files_list = all_files_list[:100000]
    #all_labels_list = lambda x: str(x).replace('.png', '_mask.png')
    num_files = len(all_files_list)
    print('{} files found'.format(len(all_files_list)))
    print("Convert to Dataframe")
    #df = pd.DataFrame({'data':all_files_list, 'label':all_labels_list})
    df = pd.DataFrame(all_files_list)
    print("Create transforms")
    print("Create data")
    #class MyImageList(ImageList):
    #    def open(self, fn):
    #        image = Image(grayloader(fn, onedim=True))
    #        return image
    src = (MyImageImageList.from_df(df, path='/').split_by_rand_pct(val_percent))
    print("Creating dataloaders")
    data_gen = get_data(bs=batch_size, size=224, src=src)
    #dataset = DatasetFromFolder(filename, loader = grayloader, transform=transform, target_transform=transform)
    #n_val = int(len(dataset) * val_percent)
    #n_train = len(dataset) - n_val
    #train, val = rs(dataset, [n_train, n_val])
    #data = ImageDataBunch.create(train, val, bs=batch_size, num_workers=4)
    #data.c = 2
    #data.normalize(imagenet_stats)
    #data_gen.show_batch(2)
    #plt.show()
    print("Creating learner")
    #optar = partial(DiffGrad, version=1, betas=(.95, .999), eps=1e-6)
    # Ranger optimizer replaces the default; set below via learn.opt_func.
    optar = partial(Ranger, betas=(0.95, 0.99), eps=1e-6)
    selfattention = False
    modelname = 'resnet34unetanneal'
    learn = unet_learner(data_gen, model_resnet34, pretrained=True,
                         self_attention=selfattention,
                         norm_type=NormType.Weight,
                         loss_func=loss_with_flag, y_range=(0., 1.0))
    # NOTE(review): same missing-separator pattern as TORCH_HOME above.
    learn.model_dir = os.getcwd() + 'data'
    learn.opt_func = optar
    print("Summary...")
    dt_string = datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
    #writer = SummaryWriter(comment=f'PROJ_{args.nbgrid}_LR_{lr}_BS_{batch_size}_FP_{args.fakepenalty}_N_{args.network}')
    name = f'{dt_string}_PROJ_{args.nbgrid}_LR_{lr}_BS_{batch_size}_N_{args.network}_ATT_{selfattention}_MODEL_{modelname}'
    mycallback = partial(TensorboardLogger, path='runs', run_name=name)
    learn.callback_fns.append(mycallback)
    # Drop the last layer of the U-Net head before training.
    learn.model.layers = learn.model.layers[:-1]
    print(learn.summary())
    #learn.lr_find(stop_div = False, num_it=200)
    #learn.recorder.plot(suggestion=True)
    #plt.show()
    # Phase 1: frozen backbone, flat-anneal schedule.
    flattenAnneal(learn, lr, epochs, 0.7)
    #learn.fit_one_cycle(epochs, max_lr = lr)
    torch.save(learn.model, os.getcwd() + '/data/' + name + '_TORCH_INTERMEDIATE.pth')
    learn.export(os.getcwd() + '/data/' + name + '_INTERMEDIATE_EXPORT.pth')
    #learn.fit_one_cycle(epochs, max_lr=lr/5.0)
    # Phase 2: unfreeze and fine-tune the whole network at lr/5.
    learn.unfreeze()
    flattenAnneal(learn, lr / 5, epochs, 0.7)
    mycallback = partial(TensorboardLogger, path='runs', run_name=name + '_UNFREEZE')
    learn.callback_fns[-1] = mycallback
    torch.save(learn.model, os.getcwd() + '/data/' + name + '_TORCH.pth')
    learn.export(os.getcwd() + '/data/' + name + '_EXPORT.pth')
def train_unet(epochs=5, batch_size=1, lr=0.1, val_percent=0.1):
    """Train an image-to-image U-Net on the movement-generator dataset.

    Args:
        epochs: epochs per phase (frozen, then unfrozen).
        batch_size: batch size (reduced ~1.5x when running locally).
        lr: peak learning rate; slice(lr/50, lr/5) after unfreezing.
        val_percent: fraction of files held out for validation.
    """
    print("Start script")
    # Local run vs. grid run use different data roots.
    if args.isgrid is False:
        filename = "/media/adrian/OMENDATA/data/movementgenerator_data_multiple2/"
        batch_size = int(batch_size // 1.5)
    else:
        filename = "/idiap/temp/ashajkofci/movementgenerator_data_multiple2/"
        batch_size = batch_size
    os.environ['TORCH_HOME'] = os.getcwd() + '/data'
    all_files_list = glob.glob(filename + "*/*.png")
    print('{} files found in {}'.format(len(all_files_list), filename))
    # Sort numerically by the 9-digit index embedded in the filename.
    all_files_list = sorted(all_files_list, key=lambda name: int(name[-13:-4]))
    print('{} files found'.format(len(all_files_list)))
    print("Convert to Dataframe")
    df = pd.DataFrame(all_files_list)
    print("Create transforms")
    print("Create data")

    class MyImageList(ImageList):
        # Label images: 4-channel arrays stored as .npz, rescaled per channel.
        def open(self, fn):
            image = np.load(fn)['arr_0']
            image = np.transpose(image, (2, 0, 1))  # HWC -> CHW
            image[1] /= 128.0
            image[0] /= 128.0
            image[3] /= 5.0
            image = torch.Tensor(image)
            #print('{} {} {}'.format(image.min(), image.max(), image.mean()))
            image = Image(image)
            return image

    class MyImageImageList(ImageImageList):
        # Inputs are grayscale PNGs; labels come from MyImageList above.
        _label_cls = MyImageList

        def open(self, fn):
            return Image(grayloader(fn))

    def get_data(bs, size):
        # Labels are the matching '*_mask.npy.npz' files. Lighting/zoom
        # transforms apply to inputs only; random crops apply to both.
        data = (src.label_from_func(lambda x: str(x).replace('.png', '_mask.npy.npz'))
                .transform(get_transforms(do_flip=False, max_zoom=1.0, max_warp=0.0,
                                          max_rotate=0, max_lighting=0.3), tfm_y=False)
                .transform([rand_crop(), rand_crop()], tfm_y=True, size=size)
                .databunch(bs=bs).normalize(imagenet_stats, do_y=False))
        data.c = 4  # four output channels
        return data

    src = (MyImageImageList.from_df(df, path='/')
           .split_by_rand_pct(val_percent))
    print("Creating dataloaders")
    data_gen = get_data(bs=batch_size, size=448)
    #dataset = DatasetFromFolder(filename, loader = grayloader, transform=transform, target_transform=transform)
    #n_val = int(len(dataset) * val_percent)
    #n_train = len(dataset) - n_val
    #train, val = rs(dataset, [n_train, n_val])
    #data = ImageDataBunch.create(train, val, bs=batch_size, num_workers=4)
    #data.c = 2
    #data.normalize(imagenet_stats)
    #data_gen.show_batch(2)
    #plt.show()
    print("Creating learner")
    optar = partial(DiffGrad, version=1, betas=(.95, .999), eps=1e-6)
    selfattention = False
    modelname = 'resnet34'
    learn = unet_learner(data_gen, model_resnet34, pretrained=True,
                         loss_func=MSELossFlat(), self_attention=False)
    learn.model_dir = os.getcwd() + '/data'
    learn.opt_func = optar
    print("Summary...")
    dt_string = datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
    name = f'{dt_string}_PROJ_{args.nbgrid}_LR_{lr}_BS_{batch_size}_N_{args.network}_ATT_{selfattention}_MODEL_{modelname}'
    mycallback = partial(TensorboardLogger, path='runs', run_name=name, run_type='unet')
    learn.callback_fns.append(mycallback)
    learn.callback_fns.append(partial(SaveModelCallback, every='improvement',
                                      name='{}/{}.pth'.format(dir_checkpoint, name)))
    #learn.model.layers = learn.model.layers[:-1]
    print(learn.summary())
    #learn.lr_find(stop_div = False, num_it=200)
    #learn.recorder.plot(suggestion=True)
    #plt.show()
    # Phase 1: frozen backbone.
    learn.fit_one_cycle(epochs, max_lr=lr)
    torch.save(learn.model, 'data/' + name + '_TORCH_INTERMEDIATE.pth')
    # Phase 2: fine-tune the whole network with discriminative LRs.
    learn.unfreeze()
    learn.fit_one_cycle(epochs, max_lr=slice(lr / 50, lr / 5))
    learn.save(name + '_FINAL')
    torch.save(learn.model, 'data/' + name + '_TORCH.pth')
def train(self, tmp_dir):
    """Train a model.

    Syncs previous output from the cloud, unzips chip archives, builds a
    segmentation databunch (optionally subsampled/oversampled), trains a
    U-Net, and syncs results back.

    Args:
        tmp_dir: path to a local temp directory for downloads and chips.
    """
    self.print_options()

    # Sync output of previous training run from cloud.
    train_uri = self.backend_opts.train_uri
    train_dir = get_local_path(train_uri, tmp_dir)
    make_dir(train_dir)
    sync_from_dir(train_uri, train_dir)

    # Get zip file for each group, and unzip them into chip_dir.
    chip_dir = join(tmp_dir, 'chips')
    make_dir(chip_dir)
    for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
        zip_path = download_if_needed(zip_uri, tmp_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            zipf.extractall(chip_dir)

    # Setup data loader.
    def get_label_path(im_path):
        # '<x>-img/<name>' -> '<x>-labels/<name>'
        return Path(str(im_path.parent)[:-4] + '-labels') / im_path.name

    size = self.task_config.chip_size
    class_map = self.task_config.class_map
    classes = class_map.get_class_names()
    # Reserve index 0 for nodata if the class map does not already use it.
    if 0 not in class_map.get_keys():
        classes = ['nodata'] + classes
    num_workers = 0 if self.train_opts.debug else 4
    train_img_dir = self.subset_training_data(chip_dir)

    def get_data(train_sampler=None):
        # Rebuildable databunch factory so we can re-create it with a
        # weighted sampler after inspecting the class distribution.
        data = (SegmentationItemList.from_folder(chip_dir).split_by_folder(
            train=train_img_dir, valid='val-img').label_from_func(
                get_label_path, classes=classes).transform(
                    get_transforms(flip_vert=self.train_opts.flip_vert),
                    size=size,
                    tfm_y=True).databunch(bs=self.train_opts.batch_sz,
                                          num_workers=num_workers,
                                          train_sampler=train_sampler))
        return data

    data = get_data()

    oversample = self.train_opts.oversample
    if oversample:
        sampler = get_weighted_sampler(data.train_ds,
                                       oversample['rare_class_ids'],
                                       oversample['rare_target_prop'])
        data = get_data(train_sampler=sampler)

    if self.train_opts.debug:
        make_debug_chips(data, class_map, tmp_dir, train_uri)

    # Setup learner.
    ignore_idx = 0
    metrics = [
        Precision(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
        Recall(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
        FBeta(average='weighted', clas_idx=1, beta=1, ignore_idx=ignore_idx)
    ]
    model_arch = getattr(models, self.train_opts.model_arch)
    learn = unet_learner(data,
                         model_arch,
                         metrics=metrics,
                         wd=self.train_opts.weight_decay,
                         bottle=True,
                         path=train_dir)
    learn.unfreeze()

    if self.train_opts.fp16 and torch.cuda.is_available():
        # This loss_scale works for Resnet 34 and 50. You might need to adjust this
        # for other models.
        learn = learn.to_fp16(loss_scale=256)

    # Setup callbacks and train model.
    model_path = get_local_path(self.backend_opts.model_uri, tmp_dir)

    pretrained_uri = self.backend_opts.pretrained_uri
    if pretrained_uri:
        print('Loading weights from pretrained_uri: {}'.format(
            pretrained_uri))
        pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
        # strict=False: the pretrained head may differ from this model's.
        learn.model.load_state_dict(torch.load(
            pretrained_path, map_location=learn.data.device),
                                    strict=False)

    # Save every epoch so that resume functionality provided by
    # TrackEpochCallback will work.
    callbacks = [
        TrackEpochCallback(learn),
        MySaveModelCallback(learn, every='epoch'),
        MyCSVLogger(learn, filename='log'),
        ExportCallback(learn, model_path, monitor='f_beta'),
        SyncCallback(train_dir, self.backend_opts.train_uri,
                     self.train_opts.sync_interval)
    ]

    lr = self.train_opts.lr
    num_epochs = self.train_opts.num_epochs
    if self.train_opts.one_cycle:
        # If no LR was configured, pick one with the LR finder.
        if lr is None:
            learn.lr_find()
            learn.recorder.plot(suggestion=True, return_fig=True)
            lr = learn.recorder.min_grad_lr
            print('lr_find() found lr: {}'.format(lr))
        learn.fit_one_cycle(num_epochs, lr, callbacks=callbacks)
    else:
        learn.fit(num_epochs, lr, callbacks=callbacks)

    # Since model is exported every epoch, we need some other way to
    # show that training is finished.
    str_to_file('done!', self.backend_opts.train_done_uri)

    # Sync output to cloud.
    sync_to_dir(train_dir, self.backend_opts.train_uri)
def _learn(self, data_path: Path, params: Tuple[Any], stop_early: bool,
           learner_type="cnn") -> Tuple[Learner, Time]:
    """ Given a set of permutations, create a learner to train and validate on
    the dataset.

    Args:
        data_path (Path): The location of the data to use
        params (Tuple[Any]): The set of parameters to train and validate on
        stop_early (bool): Whether or not to stop early if the evaluation
            metric does not improve
        learner_type (str): "cnn" for classification, "unet" for
            segmentation

    Returns:
        Tuple[Learner, Time]: Learn object from Fastai and the duration in
            seconds it took.
    """
    start = time.time()
    params = self._param_tuple_to_dict(params)

    transform = params["transform"]
    im_size = params["im_size"]
    epochs = params["epochs"]
    batch_size = params["batch_size"]
    architecture = params["architecture"]
    dropout = params["dropout"]
    learning_rate = params["learning_rate"]
    discriminative_lr = params["discriminative_lr"]
    training_schedule = params["training_schedule"]
    one_cycle_policy = params["one_cycle_policy"]
    weight_decay = params["weight_decay"]

    callbacks = list()
    if stop_early:
        callbacks.append(ParameterSweeper._early_stopping_callback())

    # Initialize CNN learner
    if learner_type == "cnn":
        data = self._get_data_bunch_imagelist(data_path, transform,
                                              im_size, batch_size)
        learn = cnn_learner(
            data,
            architecture.value,
            metrics=accuracy,
            ps=dropout,
            callback_fns=callbacks,
        )
    # Initialize UNet learner
    elif learner_type == "unet":
        classes = read_classes(os.path.join(data_path, "classes.txt"))
        data = self._get_data_bunch_segmentationitemlist(
            data_path, transform, im_size, batch_size, classes)
        metric = get_objective_fct(classes)
        metric.__name__ = "ratio_correct"
        learn = unet_learner(
            data,
            architecture.value,
            wd=1e-2,
            metrics=metric,
            callback_fns=callbacks,
        )
    else:
        # NOTE: execution continues with `learn` unbound, which raises
        # NameError below; kept as a print to preserve the interface.
        print(f"Mode learner_type={learner_type} not supported.")

    head_learning_rate = learning_rate
    body_learning_rate = (slice(learning_rate, 3e-3)
                          if discriminative_lr else learning_rate)

    # BUGFIX: the parameter was declared `wd=float`, which made the builtin
    # `float` type its default value; `wd: float` (an annotation) was
    # clearly intended. All call sites pass wd positionally, so behavior
    # is unchanged for them.
    def fit(learn: Learner, e: int, lr: Union[slice, float],
            wd: float) -> partial:
        """ Returns a partial func for either fit_one_cycle or fit
        depending on <one_cycle_policy> """
        return (partial(learn.fit_one_cycle, cyc_len=e, max_lr=lr, wd=wd)
                if one_cycle_policy else partial(
                    learn.fit, epochs=e, lr=lr, wd=wd))

    if training_schedule is TrainingSchedule.head_only:
        if discriminative_lr:
            raise Exception(
                "Cannot run discriminative_lr if training schedule is head_only."
            )
        else:
            fit(learn, epochs, body_learning_rate, weight_decay)()

    elif training_schedule is TrainingSchedule.body_only:
        learn.unfreeze()
        fit(learn, epochs, body_learning_rate, weight_decay)()

    elif training_schedule is TrainingSchedule.head_first_then_body:
        # Train the head for a quarter of the epochs, then the full body.
        head_epochs = epochs // 4
        fit(learn, head_epochs, head_learning_rate, weight_decay)()
        learn.unfreeze()
        fit(learn, epochs - head_epochs, body_learning_rate,
            weight_decay)()

    end = time.time()
    duration = end - start

    return learn, duration
def train(self, tmp_dir): """Train a model.""" # Setup hyperparams. bs = int(self.config.get('bs', 8)) wd = self.config.get('wd', 1e-2) lr = self.config.get('lr', 2e-3) num_epochs = int(self.config.get('num_epochs', 10)) model_arch = self.config.get('model_arch', 'resnet50') model_arch = getattr(models, model_arch) fp16 = self.config.get('fp16', False) sync_interval = self.config.get('sync_interval', 1) debug = self.config.get('debug', False) chip_uri = self.config['chip_uri'] train_uri = self.config['train_uri'] # Sync output of previous training run from cloud. train_dir = get_local_path(train_uri, tmp_dir) make_dir(train_dir) sync_from_dir(train_uri, train_dir) # Get zip file for each group, and unzip them into chip_dir. chip_dir = join(tmp_dir, 'chips') make_dir(chip_dir) for zip_uri in list_paths(chip_uri, 'zip'): zip_path = download_if_needed(zip_uri, tmp_dir) with zipfile.ZipFile(zip_path, 'r') as zipf: zipf.extractall(chip_dir) # Setup data loader. def get_label_path(im_path): return Path(str(im_path.parent)[:-4] + '-labels') / im_path.name size = self.task_config.chip_size classes = ['nodata'] + self.task_config.class_map.get_class_names() data = (SegmentationItemList.from_folder(chip_dir).split_by_folder( train='train-img', valid='val-img').label_from_func( get_label_path, classes=classes).transform(get_transforms(), size=size, tfm_y=True).databunch(bs=bs)) print(data) if debug: # We make debug chips during the run-time of the train command # rather than the chip command # because this is a better test (see "visualize just before the net" # in https://karpathy.github.io/2019/04/25/recipe/), and because # it's more convenient since we have the databunch here. 
# TODO make color map based on colors in class_map # TODO get rid of white frame # TODO zip them def _make_debug_chips(split): debug_chips_dir = join(train_uri, '{}-debug-chips'.format(split)) make_dir(debug_chips_dir) ds = data.train_ds if split == 'train' else data.valid_ds for i, (x, y) in enumerate(ds): x.show(y=y) plt.savefig(join(debug_chips_dir, '{}.png'.format(i))) plt.close() _make_debug_chips('train') _make_debug_chips('val') # Setup learner. metrics = [semseg_acc] learn = unet_learner(data, model_arch, metrics=metrics, wd=wd, bottle=True) learn.unfreeze() if fp16 and torch.cuda.is_available(): # This loss_scale works for Resnet 34 and 50. You might need to adjust this # for other models. learn = learn.to_fp16(loss_scale=256) # Setup ability to resume training if model exists. # This hack won't properly set the learning as a function of epochs # when resuming. learner_path = join(train_dir, 'learner.pth') log_path = join(train_dir, 'log') start_epoch = 0 if isfile(learner_path): print('Loading saved model...') start_epoch = get_last_epoch(str(log_path) + '.csv') + 1 if start_epoch >= num_epochs: print('Training is already done. If you would like to re-train' ', delete the previous results of training in ' '{}.'.format(train_uri)) return learn.load(learner_path[:-4]) print('Resuming from epoch {}'.format(start_epoch)) print( 'Note: fastai does not support a start_epoch, so epoch 0 below ' 'corresponds to {}'.format(start_epoch)) epochs_left = num_epochs - start_epoch # Setup callbacks and train model. callbacks = [ SaveModelCallback(learn, name=learner_path[:-4]), MyCSVLogger(learn, filename=log_path, start_epoch=start_epoch), SyncCallback(train_dir, train_uri, sync_interval) ] learn.fit(epochs_left, lr, callbacks=callbacks) # Export model for inference model_uri = self.config['model_uri'] model_path = get_local_path(model_uri, tmp_dir) learn.export(model_path) # Sync output to cloud. sync_to_dir(train_dir, train_uri)
import torchvision
from fastai.vision import ImageDataBunch, cnn_learner, unet_learner, SegmentationItemList, imagenet_stats

# Export a classification learner fixture.
data = ImageDataBunch.from_csv('fixtures/classification').normalize(
    imagenet_stats)
learner = cnn_learner(data, torchvision.models.resnet34)
learner.export()

# Export a segmentation learner fixture: masks live alongside the images,
# keyed by stem, with three classes.
data = (SegmentationItemList.from_folder(
    'fixtures/segmentation/images').split_none().label_from_func(
        lambda x: f'fixtures/segmentation/masks/{x.stem}.jpg',
        classes=[0, 1, 2]).databunch().normalize(imagenet_stats))
learner = unet_learner(data, torchvision.models.resnet50)
learner.export('../export.pkl')
base_loss = fv.F.l1_loss


def custom_loss(input, target):
    """L1 loss with a 100x extra penalty on targets strictly inside (0, 1)."""
    interior = (target > 0) & (target < 1)
    penalty = 100 * base_loss(input[interior], target[interior])
    return base_loss(input, target) + penalty


wd = 1e-3
learn = fv.unet_learner(
    data,
    arch,
    wd=wd,
    loss_func=custom_loss,
    blur=False,
    norm_type=fv.NormType.Weight,
    pretrained=False,
)
fv.gc.collect()
# %%
# learn.lr_find()
# learn.recorder.plot()
#%%
lr = 1e-3
def train(test, s3_data, batch):
    """Train a segmentation model using fastai and PyTorch on the Camvid
    dataset. This will write to a CSV log after each epoch, sync output to
    S3, and resume training from a checkpoint.

    Note: This is an adaptation of
    https://github.com/fastai/course-v3/blob/master/nbs/dl1/lesson3-camvid-tiramisu.ipynb
    and uses the Camvid Tiramisu-subset dataset described in the fast.ai
    course at half-resolution. This takes about a minute to get to around
    90% accuracy on a p3.2xlarge.
    """
    if batch:
        run_on_batch()

    # Setup hyperparams.
    bs = 8
    wd = 1e-2
    lr = 2e-3
    num_epochs = 10
    sample_pct = 1.0
    model_arch = models.resnet34
    fp16 = False
    sync_interval = 20  # Don't sync during training for such a small job.
    seed = 1234

    # Test mode: tiny subset, fewer epochs, smaller backbone.
    if test:
        bs = 1
        num_epochs = 2
        sample_pct = 0.01
        model_arch = models.resnet18

    # Setup paths.
    data_uri = Path('/opt/data/camvid/CamVid')
    train_uri = Path('/opt/data/camvid/train')
    data_dir = data_uri
    train_dir = train_uri
    if s3_data:
        temp_dir_obj = tempfile.TemporaryDirectory()
        data_uri = 's3://raster-vision-lf-dev/camvid/CamVid.zip'
        train_uri = 's3://raster-vision-lf-dev/camvid/train'
        train_dir = Path(temp_dir_obj.name) / 'train'
        data_dir = Path(temp_dir_obj.name) / 'data'
        make_dir(train_dir)
        make_dir(data_dir)

    # Retrieve data and remote training directory.
    if s3_data:
        print('Downloading data...')
        data_zip = Path(temp_dir_obj.name) / 'CamVid.zip'
        s3_utils.copy_from(data_uri, str(data_zip))
        zip_ref = zipfile.ZipFile(data_zip, 'r')
        zip_ref.extractall(data_dir)
        zip_ref.close()
        data_dir = data_dir / 'CamVid'
        # Resume: pull any previous training output down from S3.
        if s3_utils.list_paths(train_uri):
            print('Syncing train dir...')
            s3_utils.sync_to_dir(train_uri, str(train_dir))

    # Setup data loader.
    def get_y_fn(x):
        # '<dir>/<name>' -> '<dir>annot/<name>'
        return Path(str(x.parent) + 'annot') / x.name

    # Use the first validation image to infer the source resolution,
    # then train at half that size.
    fnames = get_image_files(data_dir / 'val')
    img = open_image(fnames[0])
    src_size = np.array(img.data.shape[1:])
    size = src_size // 2

    data = (SegmentationItemList.from_folder(data_dir).use_partial_data(
        sample_pct, seed).split_by_folder(valid='val').label_from_func(
            get_y_fn, classes=codes).transform(
                get_transforms(), size=size,
                tfm_y=True).databunch(bs=bs).normalize(imagenet_stats))

    # Setup metrics, callbacks, and then train model.
    metrics = [acc_camvid]
    model_path = train_dir / 'stage-1'
    log_path = train_dir / 'log'
    learn = unet_learner(data, model_arch, metrics=metrics, wd=wd, bottle=True)
    learn.unfreeze()

    if fp16 and torch.cuda.is_available():
        # This loss_scale works for Resnet 34 and 50. You might need to adjust this
        # for other models.
        learn = learn.to_fp16(loss_scale=256)

    # Resume from a saved checkpoint if one exists.
    start_epoch = 1
    if os.path.isfile(str(model_path) + '.pth'):
        print('Loading saved model...')
        start_epoch = get_last_epoch(str(log_path) + '.csv') + 1
        if start_epoch > num_epochs:
            print(
                'Training already done. If you would like to re-train, delete '
                'previous results of training in {}.'.format(train_dir))
            exit()

        learn.load(model_path)
        print('Resuming from epoch {}'.format(start_epoch))
        print('Note: fastai does not support a start_epoch, so epoch 1 below '
              'corresponds to {}'.format(start_epoch))

    callbacks = [
        SaveModelCallback(learn, name=model_path),
        MyCSVLogger(learn, filename=log_path, start_epoch=start_epoch)
    ]
    if s3_data:
        callbacks.append(S3SyncCallback(train_dir, train_uri, sync_interval))

    epochs_left = num_epochs - start_epoch + 1
    lrs = slice(lr / 100, lr)
    learn.fit_one_cycle(epochs_left, lrs, pct_start=0.8, callbacks=callbacks)

    if s3_data:
        s3_utils.sync_from_dir(train_dir, train_uri)
# Build the labeled source for this fold: items come from the dataframe,
# with the fold's validation indices held out.
src = (SegmentationItemList.from_df(
    unique_images, DATA / ('train_images' + str(SUFFIX)),
    cols='im_id').split_by_idx(valid_index).label_from_func(get_y_fn,
                                                            classes=codes))

# NOTE(review): `transforms` is assigned but never used -- the pipeline
# below calls get_transforms() again with default args; confirm which
# transform set (warp/rotate disabled or defaults) was intended.
transforms = get_transforms(max_warp=0, max_rotate=0)

data = (src.transform(get_transforms(), tfm_y=True,
                      size=training_image_size,
                      resize_method=ResizeMethod.PAD,
                      padding_mode="zeros").databunch(
                          bs=batch_size).normalize(imagenet_stats))

learn = unet_learner(data, models.resnet34, pretrained=True,
                     metrics=[multiclass_dice, dice_50],
                     loss_func=BCEDiceLoss(), model_dir=DATA)

# Warm up the head, then fine-tune the whole network.
learn.fit_one_cycle(10, 1e-3)
learn.unfreeze()
learn.fit_one_cycle(60, slice(1e-6, 1e-3))

# Record the last validation dice score for this fold.
valid_dice_score = learn.recorder.metrics[-1]
print("DEBUG", valid_dice_score)
all_dice_scores.append(valid_dice_score)

#Save model
filename = MODEL_NAME + '_' + str(currentFold)
print(filename)
learn.export()
def train(self, tmp_dir):
    """Train a model.

    This downloads any previous output saved to the train_uri,
    starts training (or resumes from a checkpoint), periodically
    syncs contents of train_dir to train_uri and after training finishes.

    Args:
        tmp_dir: (str) path to temp directory
    """
    self.log_options()

    # Sync output of previous training run from cloud.
    train_uri = self.backend_opts.train_uri
    train_dir = get_local_path(train_uri, tmp_dir)
    make_dir(train_dir)
    sync_from_dir(train_uri, train_dir)

    # Get zip file for each group, and unzip them into chip_dir.
    chip_dir = join(tmp_dir, 'chips')
    make_dir(chip_dir)
    for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
        zip_path = download_if_needed(zip_uri, tmp_dir)
        with zipfile.ZipFile(zip_path, 'r') as zipf:
            zipf.extractall(chip_dir)

    # Setup data loader.
    def get_label_path(im_path):
        # '<x>-img/<name>' -> '<x>-labels/<name>'
        return Path(str(im_path.parent)[:-4] + '-labels') / im_path.name

    size = self.task_config.chip_size
    class_map = self.task_config.class_map
    classes = class_map.get_class_names()
    # Reserve index 0 for nodata when the class map does not use it.
    if 0 not in class_map.get_keys():
        classes = ['nodata'] + classes
    num_workers = 0 if self.train_opts.debug else 4

    # First pass: split only, to determine how many training items to keep.
    data = (SegmentationItemList.from_folder(chip_dir)
            .split_by_folder(train='train-img', valid='val-img'))
    train_count = None
    if self.train_opts.train_count is not None:
        train_count = min(len(data.train), self.train_opts.train_count)
    elif self.train_opts.train_prop != 1.0:
        train_count = int(round(self.train_opts.train_prop * len(data.train)))
    train_items = data.train.items
    if train_count is not None:
        # Randomly subsample the training items.
        train_inds = np.random.permutation(np.arange(len(data.train)))[0:train_count]
        train_items = train_items[train_inds]
    items = np.concatenate([train_items, data.valid.items])

    # Second pass: rebuild the databunch from the (possibly subsampled) items.
    data = (SegmentationItemList(items, chip_dir)
            .split_by_folder(train='train-img', valid='val-img')
            .label_from_func(get_label_path, classes=classes)
            .transform(get_transforms(flip_vert=self.train_opts.flip_vert),
                       size=size, tfm_y=True)
            .databunch(bs=self.train_opts.batch_sz,
                       num_workers=num_workers))
    print(data)

    # Setup learner.
    ignore_idx = 0
    metrics = [
        Precision(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
        Recall(average='weighted', clas_idx=1, ignore_idx=ignore_idx),
        FBeta(average='weighted', clas_idx=1, beta=1, ignore_idx=ignore_idx)]
    model_arch = getattr(models, self.train_opts.model_arch)
    learn = unet_learner(
        data, model_arch, metrics=metrics, wd=self.train_opts.weight_decay,
        bottle=True, path=train_dir)
    learn.unfreeze()

    if self.train_opts.mixed_prec and torch.cuda.is_available():
        # This loss_scale works for Resnet 34 and 50. You might need to adjust this
        # for other models.
        learn = learn.to_fp16(loss_scale=256)

    # Setup callbacks and train model.
    model_path = get_local_path(self.backend_opts.model_uri, tmp_dir)

    pretrained_uri = self.backend_opts.pretrained_uri
    if pretrained_uri:
        print('Loading weights from pretrained_uri: {}'.format(
            pretrained_uri))
        pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
        # strict=False: the pretrained head may not match this model's.
        learn.model.load_state_dict(
            torch.load(pretrained_path, map_location=learn.data.device),
            strict=False)

    # Save every epoch so that resume functionality provided by
    # TrackEpochCallback will work.
    callbacks = [
        TrackEpochCallback(learn),
        MySaveModelCallback(learn, every='epoch'),
        MyCSVLogger(learn, filename='log'),
        ExportCallback(learn, model_path, monitor='f_beta'),
        SyncCallback(train_dir, self.backend_opts.train_uri,
                     self.train_opts.sync_interval)
    ]

    oversample = self.train_opts.oversample
    if oversample:
        weights = get_oversampling_weights(
            data.train_ds, oversample['rare_class_ids'],
            oversample['rare_target_prop'])
        oversample_callback = OverSamplingCallback(learn, weights=weights)
        callbacks.append(oversample_callback)

    if self.train_opts.debug:
        # Trigger oversampling first so debug chips reflect the real sampling.
        if oversample:
            oversample_callback.on_train_begin()
        make_debug_chips(data, class_map, tmp_dir, train_uri)

    if self.train_opts.log_tensorboard:
        callbacks.append(TensorboardLogger(learn, 'run'))

    if self.train_opts.run_tensorboard:
        log.info('Starting tensorboard process')
        log_dir = join(train_dir, 'logs', 'run')
        tensorboard_process = Popen(
            ['tensorboard', '--logdir={}'.format(log_dir)])
        terminate_at_exit(tensorboard_process)

    lr = self.train_opts.lr
    num_epochs = self.train_opts.num_epochs
    if self.train_opts.one_cycle:
        # If no LR was configured, pick one with the LR finder.
        if lr is None:
            learn.lr_find()
            learn.recorder.plot(suggestion=True, return_fig=True)
            lr = learn.recorder.min_grad_lr
            print('lr_find() found lr: {}'.format(lr))
        learn.fit_one_cycle(num_epochs, lr, callbacks=callbacks)
    else:
        learn.fit(num_epochs, lr, callbacks=callbacks)

    if self.train_opts.run_tensorboard:
        tensorboard_process.terminate()

    # Since model is exported every epoch, we need some other way to
    # show that training is finished.
    str_to_file('done!', self.backend_opts.train_done_uri)

    # Sync output to cloud.
    sync_to_dir(train_dir, self.backend_opts.train_uri)