def get_callbacks(self): """ Gets the callbacks list; since this is multi-task, we need multiple metrics! Therefore, callbacks_list will now contain the CriterionAggregatorCallback and CriterionCallback. They calculate and record the `seg_loss` and `clf_loss`. """ from catalyst.dl.callbacks import CriterionAggregatorCallback, \ CriterionCallback seg_loss_name = self.criterion_params["seg_loss"].lower() clf_loss_name = self.criterion_params["clf_loss"].lower() criterion_cb_list = [ CriterionCallback(prefix="seg_loss", input_key="seg_targets", output_key="seg_logits", criterion_key=seg_loss_name), CriterionCallback(prefix="clf_loss", input_key="clf_targets", output_key="clf_logits", criterion_key=clf_loss_name), CriterionAggregatorCallback(prefix="loss", loss_keys=\ ["seg_loss", "clf_loss"]), ] # regular callbacks cb_name_list = list(self.cb_params.keys()) cb_name_list.remove("checkpoint_params") callbacks_list = [ callbacks.__dict__[cb_name](**self.cb_params[cb_name]) for cb_name in cb_name_list ] callbacks_list = self.load_weights(callbacks_list) + criterion_cb_list print(f"Callbacks: {[cb.__class__.__name__ for cb in callbacks_list]}") return callbacks_list
def get_callbacks(config: Dict):
    return [
        CriterionCallback(**config["criterion_callback_params"]),
        OptimizerCallback(**config["optimizer_callback_params"]),
        CheckpointCallback(save_n_best=3),
        EarlyStoppingCallback(**config["early_stopping"]),
    ]
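# A hedged example of the config dict the function above expects; the
# parameter values are illustrative, not taken from the source:
config = {
    "criterion_callback_params": {"input_key": "targets", "prefix": "loss"},
    "optimizer_callback_params": {"accumulation_steps": 1},
    "early_stopping": {"patience": 5, "min_delta": 0.001},
}
callbacks = get_callbacks(config)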
def get_callbacks(self):
    from catalyst.dl.callbacks import (CriterionAggregatorCallback,
                                       CriterionCallback)

    seg_loss_name = self.criterion_params["seg_loss"].lower()
    clf_loss_name = self.criterion_params["clf_loss"].lower()
    callbacks_list = [
        CriterionCallback(prefix="seg_loss", input_key="seg_targets",
                          output_key="seg_logits",
                          criterion_key=seg_loss_name),
        CriterionCallback(prefix="clf_loss", input_key="clf_targets",
                          output_key="clf_logits",
                          criterion_key=clf_loss_name),
        CriterionAggregatorCallback(prefix="loss",
                                    loss_keys=["seg_loss", "clf_loss"]),
        EarlyStoppingCallback(**self.cb_params["earlystop"]),
    ]
    ckpoint_params = self.cb_params["checkpoint_params"]
    # a None checkpoint_path means "no checkpoint callback"
    if ckpoint_params["checkpoint_path"] is not None:
        mode = ckpoint_params["mode"].lower()
        if mode == "full":
            print("Stateful loading...")
            ckpoint_p = Path(ckpoint_params["checkpoint_path"])
            fname = ckpoint_p.name
            # everything in the path besides the base file name
            resume_dir = str(ckpoint_p.parents[0])
            print(f"Loading {fname} from {resume_dir}.\n"
                  f"Checkpoints will also be saved in {resume_dir}.")
            # adding the checkpoint callback
            callbacks_list = callbacks_list + [
                CheckpointCallback(resume=fname, resume_dir=resume_dir),
            ]
        elif mode == "model_only":
            print("Loading weights into model...")
            self.model = load_weights_train(
                ckpoint_params["checkpoint_path"], self.model)
    print(f"Callbacks: {callbacks_list}")
    return callbacks_list
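# A minimal sketch of what a helper like load_weights_train is assumed to do
# (the real implementation lives elsewhere in this codebase): load only the
# model weights from a Catalyst checkpoint, skipping optimizer/scheduler state.
import torch

def load_weights_train(checkpoint_path, model):
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    # Catalyst checkpoints store the weights under "model_state_dict";
    # fall back to treating the file as a raw state dict otherwise.
    state_dict = checkpoint.get("model_state_dict", checkpoint)
    model.load_state_dict(state_dict)
    return model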
# ### Running train-loop

# In[27]:

runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    # our dataloaders
    loaders=loaders,
    callbacks=[
        # Each criterion is calculated separately.
        CriterionCallback(input_key="mask", prefix="loss_dice",
                          criterion_key="dice"),
        CriterionCallback(input_key="mask", prefix="loss_iou",
                          criterion_key="iou"),
        CriterionCallback(input_key="mask", prefix="loss_ce",
                          criterion_key="ce"),
        # And only then do we aggregate everything into one loss.
        CriterionAggregatorCallback(
            prefix="loss",
            loss_aggregate_fn="weighted_sum",  # can be "sum", "weighted_sum" or "mean"
            # because we want a weighted sum, we need to add a scale for each loss
            loss_keys={"loss_dice": 1.0, "loss_iou": 1.0,
                       "loss_ce": 0.8},  # illustrative weights (assumed)
        ),
    ],
)
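# A minimal sketch (assumed semantics, not Catalyst's actual implementation)
# of what the "weighted_sum" aggregation computes over the recorded losses:
def weighted_sum(losses, weights):
    # e.g. weighted_sum({"loss_dice": d, "loss_iou": i, "loss_ce": c},
    #                   {"loss_dice": 1.0, "loss_iou": 1.0, "loss_ce": 0.8})
    return sum(weights[k] * losses[k] for k in weights)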
valid_files = image_files[test_inds]
train_labels = [getLabel(f) for f in train_files]
#train_ds = tu.ITSDatasetWithPL(train_files, df_pl, train_transforms=[albu.HorizontalFlip(), albu.VerticalFlip(), albu.ShiftScaleRotate()], blur_mask=False)
train_ds = tu.ITSDataset(train_files,
                         train_transforms=[albu.HorizontalFlip(),
                                           albu.VerticalFlip(),
                                           albu.ShiftScaleRotate()],
                         blur_mask=False)
val_ds = tu.ITSDataset(valid_files)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, num_workers=6,
                          shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, num_workers=6,
                        shuffle=False)
loaders = OrderedDict()
loaders["train"] = train_loader
loaders["valid"] = val_loader

callbacks = [
    CriterionCallback(input_key="mask", output_key="logits",
                      criterion_key="bciou", prefix="loss"),
    IouCallback(input_key="mask", output_key="logits", threshold=0.5),
    IouCallback(input_key="mask", output_key="logits", threshold=0.4,
                prefix="iou04"),
    IouCallback(input_key="mask", output_key="logits", threshold=0.6,
                prefix="iou06"),
    OptimizerCallback(accumulation_steps=2),
]
if TRAINING:
    if RESUME:
        try:
            cp = load_checkpoint(f"{LOGDIR}/checkpoints/best.pth")
        except Exception as e:
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    # our dataloaders
    loaders=get_loaders(images=ALL_IMAGES,
                        masks=ALL_MASKS,
                        random_state=SEED,
                        train_transforms_fn=train_transforms,
                        valid_transforms_fn=valid_transforms,
                        batch_size=config.BATCH_SIZE),
    callbacks=[
        # Each criterion is calculated separately.
        CriterionCallback(input_key="mask", prefix="loss_dice",
                          criterion_key="dice"),
        CriterionCallback(input_key="mask", prefix="loss_iou",
                          criterion_key="iou"),
        CriterionCallback(input_key="mask", prefix="loss_bce",
                          criterion_key="bce", multiplier=0.8),
        # And only then do we aggregate everything into one loss.
        CriterionAggregatorCallback(
            prefix="loss",
            loss_keys=["loss_dice", "loss_iou", "loss_bce"],
            loss_aggregate_fn="sum"  # or "mean"
        ),
def main():
    # Enable argument parsing for file paths
    args = vars(get_args())
    train_images_path = args["train_images"]
    train_masks_path = args["train_masks"]
    test_images_path = args["test_images"]
    test_masks_path = args["test_masks"]

    # Load the YAML configuration file
    dir_path = os.path.dirname(os.path.realpath(__file__))
    yaml_path = os.path.join(dir_path, "config/igvc.yaml")
    ARCH = yaml.safe_load(open(yaml_path, "r"))

    # Set a seed for reproducibility
    utils.set_global_seed(ARCH["train"]["seed"])
    utils.prepare_cudnn(deterministic=ARCH["train"]["cudnn"])

    # Set up U-Net with pretrained EfficientNet backbone
    model = smp.Unet(
        encoder_name=ARCH["encoder"]["name"],
        encoder_weights=ARCH["encoder"]["weight"],
        classes=ARCH["train"]["classes"],
        activation=ARCH["encoder"]["activation"],
    )

    # Get Torch loaders
    loaders = get_loaders(
        images=np.load(train_images_path),
        masks=np.load(train_masks_path),
        image_arr_path=train_images_path,
        mask_arr_path=train_masks_path,
        random_state=ARCH["train"]["random_state"],
        valid_size=ARCH["train"]["valid_size"],
        batch_size=ARCH["train"]["batch_size"],
        num_workers=ARCH["train"]["num_workers"],
    )

    # Optimize for cross entropy using Adam
    criterion = {
        "CE": CrossentropyND(),
    }
    optimizer = AdamW(
        model.parameters(),
        lr=ARCH["train"]["lr"],
        betas=(ARCH["train"]["betas_min"], ARCH["train"]["betas_max"]),
        eps=float(ARCH["train"]["eps"]),
        weight_decay=ARCH["train"]["w_decay"],
        amsgrad=ARCH["train"]["amsgrad"],
    )
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=ARCH["train"]["optim_factor"],
        patience=ARCH["train"]["optim_patience"],
    )

    device = utils.get_device()
    print("Using device: {}".format(device))
    print(f"torch: {torch.__version__}, catalyst: {catalyst.__version__}")
    runner = SupervisedRunner(device=device, input_key="image",
                              input_target_key="mask")

    # Use Catalyst callbacks for metric calculations during training
    callbacks = [
        CriterionCallback(input_key="mask", prefix="loss", criterion_key="CE"),
        MulticlassDiceMetricCallback(input_key="mask"),
    ]

    # Train and print model training logs
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=callbacks,
        logdir=ARCH["train"]["logdir"],
        num_epochs=ARCH["train"]["epochs"],
        main_metric="loss",
        minimize_metric=ARCH["train"]["minimize_metric"],
        fp16=ARCH["train"]["fp16"],
        verbose=ARCH["train"]["verbose"],
    )

    # Test model on the test dataset
    test_data = SegmentationDataset(test_images_path, test_masks_path)
    infer_loader = DataLoader(
        test_data,
        batch_size=ARCH["test"]["batch_size"],
        shuffle=ARCH["test"]["shuffle"],
        num_workers=ARCH["test"]["num_workers"],
    )

    # Get model predictions on the test dataset
    predictions = np.vstack(
        list(
            map(
                lambda x: x["logits"].cpu().numpy(),
                runner.predict_loader(
                    loader=infer_loader,
                    resume="content/full_model2/checkpoints/best.pth",
                ),
            )))
    save_result(predictions, test_data)
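# The keys read from config/igvc.yaml above imply roughly this structure,
# sketched here as the dict yaml.safe_load would return; all values are
# illustrative assumptions, not the project's actual settings:
ARCH = {
    "encoder": {"name": "efficientnet-b0", "weight": "imagenet",
                "activation": None},
    "train": {
        "seed": 42, "cudnn": True, "classes": 2,
        "random_state": 42, "valid_size": 0.2,
        "batch_size": 8, "num_workers": 4,
        "lr": 3e-4, "betas_min": 0.9, "betas_max": 0.999, "eps": "1e-8",
        "w_decay": 1e-4, "amsgrad": False,
        "optim_factor": 0.1, "optim_patience": 2,
        "logdir": "content/full_model2", "epochs": 10,
        "minimize_metric": True, "fp16": False, "verbose": True,
    },
    "test": {"batch_size": 8, "shuffle": False, "num_workers": 4},
}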
step = len(range(0, args.num_epochs, 4))
milestones = [step * i for i in range(1, 4)]
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                 milestones=milestones,
                                                 gamma=0.1)
runner = SupervisedRunner(input_key='features',
                          output_key=['embeddings', 'logits'])

callbacks = [
    AccuracyCallback(
        num_classes=args.num_classes,
        accuracy_args=[1],
        activation="Softmax",
    ),
]
if args.triplet_loss:
    # Give each loss its own prefix so the aggregator can find both;
    # the aggregated value is recorded under "loss".
    callbacks.extend([
        CriterionCallback(input_key="targets", prefix="ce",
                          criterion_key="ce"),
        CriterionCallback(input_key="targets", output_key="embeddings",
                          prefix="htl", criterion_key="htl"),
        CriterionAggregatorCallback(prefix="loss", loss_keys=["ce", "htl"],
                                    loss_aggregate_fn="sum"),
    ])
else:
    callbacks.append(
        CriterionCallback(input_key="targets", prefix="loss",
                          criterion_key="ce"))
_callbacks = OrderedDict()
callback_names = [
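# Worked example of the milestone arithmetic above (num_epochs is illustrative):
num_epochs = 20
step = len(range(0, num_epochs, 4))            # -> 5
milestones = [step * i for i in range(1, 4)]   # -> [5, 10, 15]
# MultiStepLR then multiplies the LR by gamma=0.1 at epochs 5, 10, and 15.
assert milestones == [5, 10, 15]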
valid_loader = DataLoader(valid_dataset, batch_size=bs, shuffle=False,
                          num_workers=num_workers)
loaders = {"train": train_loader, "valid": valid_loader}

runner = SupervisedRunner(model=model, device='cuda', input_key='image',
                          input_target_key='mask')
logdir = f'./logs/{args.model}'
num_epochs = args.epochs
callbacks = [
    CriterionCallback(input_key='mask', multiplier=1., prefix='loss_dice',
                      criterion_key='dice'),
    CriterionCallback(input_key='mask', prefix='loss_bce', multiplier=0.8,
                      criterion_key='bce'),
    CriterionAggregatorCallback(prefix='loss',
                                loss_keys=["loss_dice", "loss_bce"],
                                loss_aggregate_fn="sum"),
    DiceCallback(input_key='mask'),
    # accumulate gradients over 32 batches -> effective batch size 32 * bs
    OptimizerCallback(accumulation_steps=32),
    EarlyStoppingCallback(patience=8, min_delta=0.001),
]
if args.checkpoint:
    callbacks.append(
        CheckpointCallback(resume=f'{logdir}/checkpoints/best_full.pth'))
                        batch_size=BATCH_SIZE, num_workers=6, shuffle=False)
loaders = OrderedDict()
loaders["train"] = train_loader
loaders["valid"] = val_loader

runner = dl.SupervisedRunner(device=tu.device, input_key="image",
                             input_target_key="label", output_key="logits")
callbacks = [
    CriterionCallback(input_key="label", output_key="logits", prefix="loss"),
    AccuracyCallback(input_key="label", output_key="logits", prefix="acc",
                     activation="Sigmoid"),
    OptimizerCallback(accumulation_steps=2),
    #MixupCallback(alpha=0.3, input_key="label", output_key="logits", fields=("image", ))
]
if TRAINING:
    runner.train(model=model,
                 criterion=nn.CrossEntropyLoss(),
                 optimizer=optimizer,
                 scheduler=scheduler,
                 loaders=loaders,
                 logdir=LOGDIR,
# elif args.loss == 'lovasz_softmax':
#     criterion = lovasz_softmax()
elif args.loss == 'BCEMulticlassDiceLoss':
    criterion = BCEMulticlassDiceLoss()
elif args.loss == 'MulticlassDiceMetricCallback':
    criterion = MulticlassDiceMetricCallback()
elif args.loss == 'BCE':
    criterion = nn.BCEWithLogitsLoss()
else:
    criterion = smp.utils.losses.BCEDiceLoss(eps=1.)

if args.multigpu:
    model = nn.DataParallel(model)

if args.task == 'segmentation':
    callbacks = [DiceCallback(),
                 EarlyStoppingCallback(patience=5, min_delta=0.001),
                 CriterionCallback()]
elif args.task == 'classification':
    callbacks = [AUCCallback(class_names=['Fish', 'Flower', 'Gravel', 'Sugar'],
                             num_classes=4),
                 EarlyStoppingCallback(patience=5, min_delta=0.001),
                 CriterionCallback()]
if args.gradient_accumulation:
    callbacks.append(
        OptimizerCallback(accumulation_steps=args.gradient_accumulation))

runner = SupervisedRunner()
if args.train:
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=callbacks,
def __init__(self, logdir: str):
    super().__init__(
        model=None,
        loaders=None,
        callbacks=[],
        logdir=logdir,
        num_epochs=80,
        main_metric='hmar_avg',
        minimize_metric=False,
        verbose=True,
        monitoring_params={
            "name": EXPERIMENT_NAME,
            "tags": ["pytorch", "catalyst", "torchvision", "densenet201"],
            "project": "bengali-ai"
        })
    self._callbacks = OrderedDict((
        # cross entropy
        ('loss_gr', CriterionCallback(input_key="grapheme_root",
                                      output_key="logit_grapheme_root",
                                      criterion_key='cross_entropy',
                                      prefix='loss_gr',
                                      multiplier=0.7)),
        ('loss_vd', CriterionCallback(input_key="vowel_diacritic",
                                      output_key="logit_vowel_diacritic",
                                      criterion_key='cross_entropy',
                                      prefix='loss_vd',
                                      multiplier=0.2)),
        ('loss_cd', CriterionCallback(input_key="consonant_diacritic",
                                      output_key="logit_consonant_diacritic",
                                      criterion_key='cross_entropy',
                                      prefix='loss_cd',
                                      multiplier=0.1)),
        # central loss
        ('central_gr', CriterionCallback(input_key="grapheme_root",
                                         output_key="features",
                                         criterion_key='central_gr',
                                         prefix='central_gr',
                                         multiplier=1e-4)),
        ('central_vd', CriterionCallback(input_key="vowel_diacritic",
                                         output_key="features",
                                         criterion_key='central_vd',
                                         prefix='central_vd',
                                         multiplier=1e-5)),
        ('central_cd', CriterionCallback(input_key="consonant_diacritic",
                                         output_key="features",
                                         criterion_key='central_cd',
                                         prefix='central_cd',
                                         multiplier=1e-5)),
        # aggregator
        ('loss', CriterionAggregatorCallback(prefix="loss",
                                             loss_aggregate_fn="sum",
                                             loss_keys=[
                                                 "loss_gr", "loss_vd",
                                                 "loss_cd", "central_gr",
                                                 "central_vd", "central_cd"
                                             ])),
        ('early_stopping', catalyst.dl.EarlyStoppingCallback(4, 'hmar_avg',
                                                             minimize=False)),
        ('hmar_gr', HMacroAveragedRecall(input_key="grapheme_root",
                                         output_key="logit_grapheme_root",
                                         prefix="hmar_gr")),
        ('hmar_wd', HMacroAveragedRecall(input_key="vowel_diacritic",
                                         output_key="logit_vowel_diacritic",
                                         prefix="hmar_wd")),
        ('hmar_cd', HMacroAveragedRecall(input_key="consonant_diacritic",
                                         output_key="logit_consonant_diacritic",
                                         prefix="hmar_cd")),
        ('hmar_avg', AverageMetric(prefix="hmar_avg",
                                   metrics=["hmar_gr", "hmar_wd", "hmar_cd"],
                                   weights=[2, 1, 1])),
    ))
    self._criterion = {
        'cross_entropy': nn.CrossEntropyLoss(),
        'central_cd': CenterLoss(num_classes=7, feat_dim=1920,
                                 use_gpu=use_gpu),
        'central_gr': CenterLoss(num_classes=168, feat_dim=1920,
                                 use_gpu=use_gpu),
        'central_vd': CenterLoss(num_classes=11, feat_dim=1920,
                                 use_gpu=use_gpu),
    }
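# For reference, a hedged sklearn-based sketch of the competition-style
# weighted recall that the hmar_* callbacks above appear to track (this is
# not the callbacks' actual implementation):
import numpy as np
from sklearn.metrics import recall_score

def hierarchical_macro_averaged_recall(y_true_gr, y_pred_gr,
                                       y_true_vd, y_pred_vd,
                                       y_true_cd, y_pred_cd):
    # Macro recall per head, then a 2:1:1 weighted average, matching
    # weights=[2, 1, 1] in the AverageMetric callback above.
    scores = [
        recall_score(y_true_gr, y_pred_gr, average="macro"),
        recall_score(y_true_vd, y_pred_vd, average="macro"),
        recall_score(y_true_cd, y_pred_cd, average="macro"),
    ]
    return float(np.average(scores, weights=[2, 1, 1]))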
def main(config):
    opts = config()
    path = opts.path
    train = pd.read_csv(f'{path}/train.csv')
    pseudo_label = pd.read_csv(
        './submissions/submission_segmentation_and_classifier.csv')
    n_train = len(os.listdir(f'{path}/train_images'))
    n_test = len(os.listdir(f'{path}/test_images'))
    print(f'There are {n_train} images in train dataset')
    print(f'There are {n_test} images in test dataset')

    train.loc[train['EncodedPixels'].isnull() == False,
              'Image_Label'].apply(lambda x: x.split('_')[1]).value_counts()
    train.loc[train['EncodedPixels'].isnull() == False, 'Image_Label'].apply(
        lambda x: x.split('_')[0]).value_counts().value_counts()

    train['label'] = train['Image_Label'].apply(lambda x: x.split('_')[1])
    train['im_id'] = train['Image_Label'].apply(lambda x: x.split('_')[0])
    id_mask_count = train.loc[
        train['EncodedPixels'].isnull() == False,
        'Image_Label'].apply(lambda x: x.split('_')[0]).value_counts(
        ).reset_index().rename(columns={
            'index': 'img_id',
            'Image_Label': 'count'
        })
    print(id_mask_count.head())

    pseudo_label.loc[pseudo_label['EncodedPixels'].isnull() == False,
                     'Image_Label'].apply(
                         lambda x: x.split('_')[1]).value_counts()
    pseudo_label.loc[pseudo_label['EncodedPixels'].isnull() == False,
                     'Image_Label'].apply(lambda x: x.split('_')[0]
                                          ).value_counts().value_counts()
    pseudo_label['label'] = pseudo_label['Image_Label'].apply(
        lambda x: x.split('_')[1])
    pseudo_label['im_id'] = pseudo_label['Image_Label'].apply(
        lambda x: x.split('_')[0])
    pseudo_label_ids = pseudo_label.loc[
        pseudo_label['EncodedPixels'].isnull() == False, 'Image_Label'].apply(
            lambda x: x.split('_')[0]).value_counts().reset_index().rename(
                columns={
                    'index': 'img_id',
                    'Image_Label': 'count'
                })
    print(pseudo_label_ids.head())

    if not os.path.exists("csvs/train_all.csv"):
        train_ids, valid_ids = train_test_split(
            id_mask_count,
            random_state=39,
            stratify=id_mask_count['count'],
            test_size=0.1)
        valid_ids.to_csv("csvs/valid_threshold.csv", index=False)
        train_ids.to_csv("csvs/train_all.csv", index=False)
    else:
        train_ids = pd.read_csv("csvs/train_all.csv")
        valid_ids = pd.read_csv("csvs/valid_threshold.csv")

    for fold, ((train_ids_new, valid_ids_new),
               (train_ids_pl, valid_ids_pl)) in enumerate(
                   zip(
                       stratified_groups_kfold(train_ids,
                                               target='count',
                                               n_splits=opts.fold_max,
                                               random_state=0),
                       stratified_groups_kfold(pseudo_label_ids,
                                               target='count',
                                               n_splits=opts.fold_max,
                                               random_state=0))):
        train_ids_new.to_csv(f'csvs/train_fold{fold}.csv')
        valid_ids_new.to_csv(f'csvs/valid_fold{fold}.csv')
        train_ids_new = train_ids_new['img_id'].values
        valid_ids_new = valid_ids_new['img_id'].values
        train_ids_pl = train_ids_pl['img_id'].values
        valid_ids_pl = valid_ids_pl['img_id'].values

        ENCODER = opts.backborn
        ENCODER_WEIGHTS = opts.encoder_weights
        DEVICE = 'cuda'
        ACTIVATION = None
        model = get_model(
            model_type=opts.model_type,
            encoder=ENCODER,
            encoder_weights=ENCODER_WEIGHTS,
            activation=ACTIVATION,
            n_classes=opts.class_num,
            task=opts.task,
            center=opts.center,
            attention_type=opts.attention_type,
            head='simple',
            classification=opts.classification,
        )
        model = convert_model(model)
        preprocessing_fn = encoders.get_preprocessing_fn(
            ENCODER, ENCODER_WEIGHTS)

        num_workers = opts.num_workers
        bs = opts.batchsize
        train_dataset = CloudDataset(
            df=train,
            label_smoothing_eps=opts.label_smoothing_eps,
            datatype='train',
            img_ids=train_ids_new,
            transforms=get_training_augmentation(opts.img_size),
            preprocessing=get_preprocessing(preprocessing_fn))
        valid_dataset = CloudDataset(
            df=train,
            datatype='valid',
            img_ids=valid_ids_new,
            transforms=get_validation_augmentation(opts.img_size),
            preprocessing=get_preprocessing(preprocessing_fn))

        ################# make pseudo label dataset #######################
        train_dataset_pl = CloudPseudoLabelDataset(
            df=pseudo_label,
            datatype='train',
            img_ids=train_ids_pl,
            transforms=get_training_augmentation(opts.img_size),
            preprocessing=get_preprocessing(preprocessing_fn))
        valid_dataset_pl = CloudPseudoLabelDataset(
            df=pseudo_label,
            datatype='train',
            img_ids=valid_ids_pl,
            transforms=get_validation_augmentation(opts.img_size),
            preprocessing=get_preprocessing(preprocessing_fn))
        # train_dataset = ConcatDataset([train_dataset, train_dataset_pl])
        # valid_dataset = ConcatDataset([valid_dataset, valid_dataset_pl])
        train_dataset = ConcatDataset([train_dataset, valid_dataset_pl])
        ################# make pseudo label dataset #######################

        train_loader = DataLoader(train_dataset,
                                  batch_size=bs,
                                  shuffle=True,
                                  num_workers=num_workers,
                                  drop_last=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=bs,
                                  shuffle=False,
                                  num_workers=num_workers,
                                  drop_last=True)
        loaders = {"train": train_loader, "valid": valid_loader}

        num_epochs = opts.max_epoch
        logdir = f"{opts.logdir}/fold{fold}"
        optimizer = get_optimizer(optimizer=opts.optimizer,
                                  lookahead=opts.lookahead,
                                  model=model,
                                  separate_decoder=True,
                                  lr=opts.lr,
                                  lr_e=opts.lr_e)
        opt_level = 'O1'
        model.cuda()
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=opt_level)
        scheduler = opts.scheduler(optimizer)
        criterion = opts.criterion
        runner = SupervisedRunner()
        if opts.task == "segmentation":
            callbacks = [DiceCallback()]
        else:
            callbacks = []
        if opts.early_stop:
            callbacks.append(
                EarlyStoppingCallback(patience=10, min_delta=0.001))
        if opts.mixup:
            callbacks.append(MixupCallback(alpha=0.25))
        if opts.accumeration is not None:
            callbacks.append(CriterionCallback())
            callbacks.append(
                OptimizerCallback(accumulation_steps=opts.accumeration))
        print(
            f"############################## Start training of fold{fold}! ##############################"
        )
        runner.train(model=model,
                     criterion=criterion,
                     optimizer=optimizer,
                     scheduler=scheduler,
                     loaders=loaders,
                     callbacks=callbacks,
                     logdir=logdir,
                     num_epochs=num_epochs,
                     verbose=True)
        print(
            f"############################## Finish training of fold{fold}! ##############################"
        )
        del model
        del loaders
        del runner
        torch.cuda.empty_cache()
        gc.collect()
def train_model():
    model = smp.Unet(
        encoder_name=ENCODER,
        encoder_weights=ENCODER_WEIGHTS,
        classes=4,
        activation=ACTIVATION,
    )
    preprocessing_fn = smp.encoders.get_preprocessing_fn(
        ENCODER, ENCODER_WEIGHTS)

    num_workers = 0
    bs = 5
    train_dataset = CloudDataset(
        df=train,
        datatype='train',
        img_ids=train_ids,
        transforms=get_training_augmentation(),
        preprocessing=get_preprocessing(preprocessing_fn))
    valid_dataset = CloudDataset(
        df=train,
        datatype='valid',
        img_ids=valid_ids,
        transforms=get_validation_augmentation(),
        preprocessing=get_preprocessing(preprocessing_fn))
    train_loader = DataLoader(train_dataset,
                              batch_size=bs,
                              shuffle=True,
                              num_workers=num_workers)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=1,
                              shuffle=False,
                              num_workers=num_workers)
    loaders = {"train": train_loader, "valid": valid_loader}
    num_epochs = 40

    # model, criterion, optimizer
    optimizer = RAdam([
        {'params': model.decoder.parameters(), 'lr': 1e-2},
        {'params': model.encoder.parameters(), 'lr': 1e-3},
    ])
    scheduler = ReduceLROnPlateau(optimizer,
                                  factor=0.15,
                                  patience=2,
                                  threshold=0.001)
    criterion = smp.utils.losses.BCEDiceLoss(eps=1.)
    runner = SupervisedRunner()
    runner.train(model=model,
                 criterion=criterion,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 loaders=loaders,
                 callbacks=[
                     DiceCallback(),
                     EarlyStoppingCallback(patience=5, min_delta=0.001),
                     CriterionCallback(),
                     OptimizerCallback(accumulation_steps=2)
                 ],
                 logdir=logdir,
                 num_epochs=num_epochs,
                 verbose=True)
    return True
    criterion = BCEMulticlassDiceLoss()
elif args.loss == 'MulticlassDiceMetricCallback':
    criterion = MulticlassDiceMetricCallback()
elif args.loss == 'BCE':
    criterion = nn.BCEWithLogitsLoss()
else:
    criterion = smp.utils.losses.BCEDiceLoss(eps=1.)

if args.multigpu:
    model = nn.DataParallel(model)

if args.task == 'segmentation':
    callbacks = [
        DiceCallback(),
        EarlyStoppingCallback(patience=5, min_delta=0.001),
        CriterionCallback()
    ]
elif args.task == 'classification':
    callbacks = [
        AUCCallback(class_names=['Fish', 'Flower', 'Gravel', 'Sugar'],
                    num_classes=4),
        EarlyStoppingCallback(patience=5, min_delta=0.001),
        CriterionCallback()
    ]
if args.gradient_accumulation:
    callbacks.append(
        OptimizerCallback(accumulation_steps=args.gradient_accumulation))

runner = SupervisedRunner()
if args.train:
    criterion = BCEMulticlassDiceLoss()
elif args.loss == "MulticlassDiceMetricCallback":
    criterion = MulticlassDiceMetricCallback()
elif args.loss == "BCE":
    criterion = nn.BCEWithLogitsLoss()
else:
    criterion = smp.utils.losses.BCEDiceLoss(eps=1.0)

if args.multigpu:
    model = nn.DataParallel(model)

if args.task == "segmentation":
    callbacks = [
        DiceCallback(),
        EarlyStoppingCallback(patience=10, min_delta=0.001),
        CriterionCallback(),
    ]
elif args.task == "classification":
    callbacks = [
        AUCCallback(class_names=["Fish", "Flower", "Gravel", "Sugar"],
                    num_classes=4),
        EarlyStoppingCallback(patience=10, min_delta=0.001),
        CriterionCallback(),
    ]
if args.gradient_accumulation:
    callbacks.append(
        OptimizerCallback(accumulation_steps=args.gradient_accumulation))

checkpoint = utils.load_checkpoint(f"{logdir}/checkpoints/best.pth")
model.cuda()
# elif args.loss == 'lovasz_softmax':
#     criterion = lovasz_softmax()
elif args.loss == 'BCEMulticlassDiceLoss':
    criterion = BCEMulticlassDiceLoss()
elif args.loss == 'MulticlassDiceMetricCallback':
    criterion = MulticlassDiceMetricCallback()
elif args.loss == 'BCE':
    criterion = nn.BCEWithLogitsLoss()
else:
    criterion = smp.utils.losses.BCEDiceLoss(eps=1.)

if args.multigpu:
    model = nn.DataParallel(model)

if args.task == 'segmentation':
    callbacks = [DiceCallback(),
                 EarlyStoppingCallback(patience=10, min_delta=0.001),
                 CriterionCallback()]
elif args.task == 'classification':
    callbacks = [AUCCallback(class_names=['Fish', 'Flower', 'Gravel', 'Sugar'],
                             num_classes=4),
                 EarlyStoppingCallback(patience=10, min_delta=0.001),
                 CriterionCallback()]
if args.gradient_accumulation:
    callbacks.append(
        OptimizerCallback(accumulation_steps=args.gradient_accumulation))

checkpoint = utils.load_checkpoint(f'{logdir}/checkpoints/best.pth')
model.cuda()
utils.unpack_checkpoint(checkpoint, model=model)

runner = SupervisedRunner()
if args.train:
    print('Training')
)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                 factor=optim_factor,
                                                 patience=optim_patience)
num_epochs = 10
device = utils.get_device()
runner = SupervisedRunner(device=device, input_key="image",
                          input_target_key="mask")

# Use Catalyst callbacks for metric calculations during training
callbacks = [
    CriterionCallback(input_key="mask", prefix="loss", criterion_key="CE"),
    MulticlassDiceMetricCallback(input_key="mask"),
]

# Train and print model training logs
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    callbacks=callbacks,
    logdir="content/full_model2",
    num_epochs=num_epochs,
    main_metric="loss",
    minimize_metric=True,
    'train': dataloader_train,
    'valid': dataloader_val
}  # collections.OrderedDict({'train': dataloader_train, 'valid': dataloader_val})

model = ReverseModel()
optimizer = Lookahead(RAdam(params=model.parameters(), lr=1e-3))
criterion = {"bce": nn.BCEWithLogitsLoss()}
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       factor=0.25,
                                                       patience=2)
callbacks = [
    CriterionCallback(input_key='start', prefix="loss", criterion_key="bce"),
    EarlyStoppingCallback(patience=5),
]

runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    callbacks=callbacks,
    logdir="./logs",
    num_epochs=5,  # TODO
    main_metric="loss",
    minimize_metric=True,
    verbose=True,
def run(config_file):
    config = load_config(config_file)

    # set up the environment flags for working with the Kaggle GPU or Colab GPU
    if 'COLAB_GPU' in os.environ:
        config.work_dir = '/content/drive/My Drive/kaggle_cloud/' + config.work_dir
    elif 'KAGGLE_WORKING_DIR' in os.environ:
        config.work_dir = '/kaggle/working/' + config.work_dir
    print('working directory:', config.work_dir)

    # save the configuration to the working dir
    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    # enter the GPUs you have
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    # our dataset has an explicit validation folder; use that later
    all_transforms['valid'] = get_transforms(config.transforms.test)

    print("before rajat config", config.data.height, config.data.width)

    # fetch the dataloaders we need
    dataloaders = {
        phase: make_loader(data_folder=config.data.train_dir,
                           df_path=config.data.train_df_path,
                           phase=phase,
                           img_size=(config.data.height, config.data.width),
                           batch_size=config.train.batch_size,
                           num_workers=config.num_workers,
                           idx_fold=config.data.params.idx_fold,
                           transforms=all_transforms[phase],
                           num_classes=config.data.num_classes,
                           pseudo_label_path=config.train.pseudo_label_path,
                           debug=config.debug)
        for phase in ['train', 'valid']
    }

    # creating the segmentation model with pre-trained encoder
    '''
    dumping the parameters for the smp library:
    encoder_name: str = "resnet34",
    encoder_depth: int = 5,
    encoder_weights: str = "imagenet",
    decoder_use_batchnorm: bool = True,
    decoder_channels: List[int] = (256, 128, 64, 32, 16),
    decoder_attention_type: Optional[str] = None,
    in_channels: int = 3,
    classes: int = 1,
    activation: Optional[Union[str, callable]] = None,
    aux_params: Optional[dict] = None,
    '''
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    # fetch the loss
    criterion = get_loss(config)
    params = [
        {'params': model.decoder.parameters(),
         'lr': config.optimizer.params.decoder_lr},
        {'params': model.encoder.parameters(),
         'lr': config.optimizer.params.encoder_lr},
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    '''
    dumping the catalyst SupervisedRunner:
    https://github.com/catalyst-team/catalyst/blob/master/catalyst/dl/runner/supervised.py
    model (Model): Torch model object
    device (Device): Torch device
    input_key (str): Key in batch dict mapping for model input
    output_key (str): Key in output dict model output will be stored under
    input_target_key (str): Key in batch dict mapping for target
    '''
    runner = SupervisedRunner(model=model, device=get_device())

    # @pavel,srk,rajat,vladimir,pudae check the IoU and the Dice callbacks
    callbacks = [DiceCallback(), IouCallback()]

    # adding patience
    if config.train.early_stop_patience > 0:
        callbacks.append(
            EarlyStoppingCallback(patience=config.train.early_stop_patience))

    # gradient accumulation: zero_grad is taken every accumulation_steps batches
    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend([
            CriterionCallback(),
            OptimizerCallback(accumulation_steps=accumulation_steps)
        ])

    # to resume from checkpoints if they exist
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir +
                               '/checkpoints/last_full.pth'))

    # pudae, please add the callback: https://arxiv.org/pdf/1710.09412.pdf
    # **srk adding the mixup callback
    if config.train.mixup:
        callbacks.append(MixupCallback())
    # @rajat implemented cutmix, a weighted combination of cutout and mixup
    if config.train.cutmix:
        callbacks.append(CutMixCallback())

    '''
    rajat introducing the training loop:
    https://github.com/catalyst-team/catalyst/blob/master/catalyst/dl/runner/supervised.py
    take care of NVIDIA's fp16 precision
    '''
    print(config.work_dir)
    print(config.train.minimize_metric)
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=False,
    )
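# Worked example of the gradient-accumulation arithmetic above
# (accumulation_size and batch_size values are illustrative):
accumulation_size = 64
batch_size = 8
accumulation_steps = accumulation_size // batch_size  # -> 8
# OptimizerCallback then calls optimizer.step() once every 8 batches,
# for an effective batch size of accumulation_steps * batch_size = 64.
assert accumulation_steps == 8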
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--encoder', type=str, default='efficientnet-b0')
    parser.add_argument('--model', type=str, default='unet')
    parser.add_argument('--pretrained', type=str, default='imagenet')
    parser.add_argument('--logdir', type=str, default='../logs/')
    parser.add_argument('--exp_name', type=str)
    parser.add_argument('--data_folder', type=str, default='../input/')
    parser.add_argument('--height', type=int, default=320)
    parser.add_argument('--width', type=int, default=640)
    parser.add_argument('--batch_size', type=int, default=2)
    parser.add_argument('--accumulate', type=int, default=8)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--enc_lr', type=float, default=1e-2)
    parser.add_argument('--dec_lr', type=float, default=1e-3)
    parser.add_argument('--optim', type=str, default="radam")
    parser.add_argument('--loss', type=str, default="bcedice")
    parser.add_argument('--schedule', type=str, default="rlop")
    parser.add_argument('--early_stopping', type=bool, default=True)
    args = parser.parse_args()

    encoder = args.encoder
    model = args.model
    pretrained = args.pretrained
    logdir = args.logdir
    name = args.exp_name
    data_folder = args.data_folder
    height = args.height
    width = args.width
    bs = args.batch_size
    accumulate = args.accumulate
    epochs = args.epochs
    enc_lr = args.enc_lr
    dec_lr = args.dec_lr
    optim = args.optim
    loss = args.loss
    schedule = args.schedule
    early_stopping = args.early_stopping

    if model == 'unet':
        model = smp.Unet(encoder_name=encoder,
                         encoder_weights=pretrained,
                         classes=4,
                         activation=None)
    elif model == 'fpn':
        model = smp.FPN(
            encoder_name=encoder,
            encoder_weights=pretrained,
            classes=4,
            activation=None,
        )
    elif model == 'pspnet':
        model = smp.PSPNet(
            encoder_name=encoder,
            encoder_weights=pretrained,
            classes=4,
            activation=None,
        )
    elif model == 'linknet':
        model = smp.Linknet(
            encoder_name=encoder,
            encoder_weights=pretrained,
            classes=4,
            activation=None,
        )
    elif model == 'aspp':
        print('aspp can only be used with resnet34')
        model = aspp(num_class=4)

    preprocessing_fn = smp.encoders.get_preprocessing_fn(encoder, pretrained)
    log = os.path.join(logdir, name)

    ds = get_dataset(path=data_folder)
    prepared_ds = prepare_dataset(ds)
    train_set, valid_set = get_train_test(ds)
    train_ds = CloudDataset(df=prepared_ds,
                            datatype='train',
                            img_ids=train_set,
                            transforms=training1(h=height, w=width),
                            preprocessing=get_preprocessing(preprocessing_fn),
                            folder=data_folder)
    valid_ds = CloudDataset(df=prepared_ds,
                            datatype='train',
                            img_ids=valid_set,
                            transforms=valid1(h=height, w=width),
                            preprocessing=get_preprocessing(preprocessing_fn),
                            folder=data_folder)
    train_loader = DataLoader(train_ds,
                              batch_size=bs,
                              shuffle=True,
                              num_workers=multiprocessing.cpu_count())
    valid_loader = DataLoader(valid_ds,
                              batch_size=bs,
                              shuffle=False,
                              num_workers=multiprocessing.cpu_count())
    loaders = {
        'train': train_loader,
        'valid': valid_loader,
    }
    num_epochs = epochs

    if args.model != "aspp":
        if optim == "radam":
            optimizer = RAdam([
                {'params': model.encoder.parameters(), 'lr': enc_lr},
                {'params': model.decoder.parameters(), 'lr': dec_lr},
            ])
        elif optim == "adam":
            optimizer = Adam([
                {'params': model.encoder.parameters(), 'lr': enc_lr},
                {'params': model.decoder.parameters(), 'lr': dec_lr},
            ])
        elif optim == "adamw":
            optimizer = AdamW([
                {'params': model.encoder.parameters(), 'lr': enc_lr},
                {'params': model.decoder.parameters(), 'lr': dec_lr},
            ])
        elif optim == "sgd":
            optimizer = SGD([
                {'params': model.encoder.parameters(), 'lr': enc_lr},
                {'params': model.decoder.parameters(), 'lr':
                 dec_lr},
            ])
    elif args.model == 'aspp':
        if optim == "radam":
            optimizer = RAdam([
                {'params': model.parameters(), 'lr': enc_lr},
            ])
        elif optim == "adam":
            optimizer = Adam([
                {'params': model.parameters(), 'lr': enc_lr},
            ])
        elif optim == "adamw":
            optimizer = AdamW([
                {'params': model.parameters(), 'lr': enc_lr},
            ])
        elif optim == "sgd":
            optimizer = SGD([
                {'params': model.parameters(), 'lr': enc_lr},
            ])

    scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=5)
    if schedule == "rlop":
        scheduler = ReduceLROnPlateau(optimizer, factor=0.15, patience=3)
    elif schedule == "noam":
        scheduler = NoamLR(optimizer, 10)

    if loss == "bcedice":
        criterion = smp.utils.losses.BCEDiceLoss(eps=1.)
    elif loss == "dice":
        criterion = smp.utils.losses.DiceLoss(eps=1.)
    elif loss == "bcejaccard":
        criterion = smp.utils.losses.BCEJaccardLoss(eps=1.)
    elif loss == "jaccard":
        criterion = smp.utils.losses.JaccardLoss(eps=1.)
    elif loss == 'bce':
        criterion = NewBCELoss()

    callbacks = [NewDiceCallback(), CriterionCallback()]
    callbacks.append(OptimizerCallback(accumulation_steps=accumulate))
    if early_stopping:
        callbacks.append(EarlyStoppingCallback(patience=5, min_delta=0.001))

    runner = SupervisedRunner()
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=callbacks,
        logdir=log,
        num_epochs=num_epochs,
        verbose=True,
    )
def run(config_file):
    config = load_config(config_file)

    if 'COLAB_GPU' in os.environ:
        config.work_dir = '/content/drive/My Drive/kaggle_cloud/' + config.work_dir
    elif 'KAGGLE_WORKING_DIR' in os.environ:
        config.work_dir = '/kaggle/working/' + config.work_dir
    print('working directory:', config.work_dir)

    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(
            data_folder=config.data.train_dir,
            df_path=config.data.train_df_path,
            phase=phase,
            img_size=(config.data.height, config.data.width),
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx_fold,
            transforms=all_transforms[phase],
            num_classes=config.data.num_classes,
            pseudo_label_path=config.train.pseudo_label_path,
            debug=config.debug
        )
        for phase in ['train', 'valid']
    }

    # create segmentation model with pre-trained encoder
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    # train setting
    criterion = get_loss(config)
    params = [
        {'params': model.decoder.parameters(),
         'lr': config.optimizer.params.decoder_lr},
        {'params': model.encoder.parameters(),
         'lr': config.optimizer.params.encoder_lr},
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model, device=get_device())

    callbacks = [DiceCallback(), IouCallback()]

    if config.train.early_stop_patience > 0:
        callbacks.append(EarlyStoppingCallback(
            patience=config.train.early_stop_patience))

    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend(
            [CriterionCallback(),
             OptimizerCallback(accumulation_steps=accumulation_steps)]
        )

    # to resume from checkpoints if they exist
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(CheckpointCallback(
            resume=config.work_dir + '/checkpoints/last_full.pth'))

    if config.train.mixup:
        callbacks.append(MixupCallback())
    if config.train.cutmix:
        callbacks.append(CutMixCallback())

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=True,
    )