def compute(self) -> Any: """ Returns: Confusion matrix of K rows and K columns, where rows corresponds to ground-truth targets and columns corresponds to predicted targets. """ # ddp hotfix, could be done better # but metric must handle DDP on it's own if self._ddp_backend == "xla": # if you have "RuntimeError: Aborted: Session XXX is not found" here # please, ask Google for a more powerful TPU setup ;) device = get_device() value = torch.tensor([self.conf], device=device) self.conf = xm.all_gather(value).sum(0).cpu().detach().numpy() elif self._ddp_backend == "ddp": value: List[np.ndarray] = all_gather(self.conf) value: np.ndarray = np.sum(np.stack(value, axis=0), axis=0) self.conf = value if self.normalized: conf = self.conf.astype(np.float32) return conf / conf.sum(1).clip(min=1e-12)[:, None] else: return self.conf
def compute(self) -> Tuple[torch.Tensor, float, float, float]:
    """Computes the AUC metric based on saved statistics.

    Returns:
        tuple of (per-class AUC, micro-averaged AUC, macro-averaged AUC,
        weighted-averaged AUC); the per-class entry is an empty list when
        ``compute_per_class_metrics`` is disabled.
    """
    targets = torch.cat(self.targets)
    scores = torch.cat(self.scores)
    # ddp hotfix, could be done better
    # but metric must handle DDP on it's own
    if self._ddp_backend == "xla":
        # if you have "RuntimeError: Aborted: Session XXX is not found" here
        # please, ask Google for a more powerful TPU setup ;)
        device = get_device()
        scores = xm.all_gather(scores.to(device)).cpu().detach()
        targets = xm.all_gather(targets.to(device)).cpu().detach()
    elif self._ddp_backend == "ddp":
        scores = torch.cat(all_gather(scores))
        targets = torch.cat(all_gather(targets))

    scores, targets, _, _ = process_multilabel_components(
        outputs=scores, targets=targets
    )
    per_class = auc(scores=scores, targets=targets)
    # micro: pool every (score, target) pair and compute a single binary AUC
    micro = binary_auc(scores=scores.view(-1), targets=targets.view(-1))[0]
    # macro: unweighted mean over classes
    macro = per_class.mean().item()
    # weighted: classes weighted by their positive-label frequency
    weights = targets.sum(axis=0) / len(targets)
    weighted = (per_class * weights).sum().item()
    if self.compute_per_class_metrics:
        return per_class, micro, macro, weighted
    else:
        return [], micro, macro, weighted
def __init__(self, config: dict): """ Args: config (dict): Attributes: config-related: config (dict): io_params (dict): in_dir (key: str): path to the data folder test_size (key: float): split size for test split_seed (key: int): seed batch_size (key: int): <- num_workers (key: int): # of workers for data loaders split_dict (dict): test_ids test_dset (torch.data.Dataset): <- loaders (dict): train/validation loaders model (torch.nn.Module): <- """ # for reuse self.config = config self.io_params = config["io_params"] # initializing the experiment components self.case_list = self.setup_im_ids() test_ids = self.get_split()[-1] if config["with_masks"] else self.case_list print(f"Inferring on {len(test_ids)} test cases") self.test_dset = self.get_datasets(test_ids) self.loaders = self.get_loaders() self.model = self.get_model().to(get_device()) self.load_weights() print(f"Device: {get_device()}")
def compute(self) -> Any: """ Compute precision, recall, f1 score and support. Compute micro, macro and weighted average for the metrics. Returns: list of aggregated metrics: per-class, micro, macro and weighted averaging of precision, recall, f1 score and support metrics """ # ddp hotfix, could be done better # but metric must handle DDP on it's own if self._ddp_backend == "xla": device = get_device() for key in self.statistics: key_statistics = torch.tensor([self.statistics[key]], device=device) key_statistics = xm.all_gather(key_statistics).sum( dim=0).cpu().numpy() self.statistics[key] = key_statistics elif self._ddp_backend == "ddp": for key in self.statistics: value: List[np.ndarray] = all_gather(self.statistics[key]) value: np.ndarray = np.sum(np.vstack(value), axis=0) self.statistics[key] = value per_class, micro, macro, weighted = get_aggregated_metrics( tp=self.statistics["tp"], fp=self.statistics["fp"], fn=self.statistics["fn"], support=self.statistics["support"], zero_division=self.zero_division, ) return per_class, micro, macro, weighted
def __init__(
    self,
    agent: Union[ActorSpec, CriticSpec],
    env: EnvironmentSpec,
    db_server: DBSpec = None,
    exploration_handler: ExplorationHandler = None,
    logdir: str = None,
    id: int = 0,
    mode: str = "infer",  # train/valid/infer
    deterministic: bool = None,
    weights_sync_period: int = 1,
    weights_sync_mode: str = None,
    sampler_seed: int = 42,
    trajectory_seeds: List = None,
    trajectory_limit: int = None,
    force_store: bool = False,
    gc_period: int = 10,
    monitoring_params: Dict = None,
    **kwargs
):
    """RL sampler: rolls trajectories out of ``env`` with ``agent``.

    Args:
        agent: actor or critic used to select actions
        env: environment specification
        db_server: optional storage backend for trajectories/weights
        exploration_handler: exploration strategy handler
        logdir: directory for sampler logs
        id: sampler identifier (note: shadows the ``id`` builtin)
        mode: one of "train"/"valid"/"infer"
        deterministic: force deterministic action selection; defaults to
            True for "valid"/"infer" modes, False for "train"
        weights_sync_period: how often to sync agent weights
        weights_sync_mode: weight synchronization mode
        sampler_seed: base seed for the internal seeder
        trajectory_seeds: optional fixed list of trajectory seeds
        trajectory_limit: max number of trajectories (unbounded if None)
        force_store: always store sampled trajectories
        gc_period: run garbage collection every N trajectories
        monitoring_params: external monitoring configuration
        **kwargs: forwarded to ``self._init``
    """
    self._device = utils.get_device()
    self._sampler_id = id

    # infer/valid runs default to deterministic behavior
    self._deterministic = deterministic \
        if deterministic is not None \
        else mode in ["valid", "infer"]
    self.trajectory_seeds = trajectory_seeds
    self._seeder = tools.Seeder(init_seed=sampler_seed)

    # logging
    self._prepare_logger(logdir, mode)
    # process-shared flags that start/stop sampling from outside
    self._sampling_flag = mp.Value(c_bool, False)
    self._training_flag = mp.Value(c_bool, True)

    # environment, model, exploration & action handlers
    self.env = env
    self.agent = agent
    self.exploration_handler = exploration_handler
    self.trajectory_index = 0
    self.trajectory_sampler = TrajectorySampler(
        env=self.env,
        agent=self.agent,
        device=self._device,
        deterministic=self._deterministic,
        sampling_flag=self._sampling_flag)

    # synchronization configuration
    self.db_server = db_server
    self._weights_sync_period = weights_sync_period
    self._weights_sync_mode = weights_sync_mode
    self._trajectory_limit = trajectory_limit or np.iinfo(np.int32).max
    self._force_store = force_store
    self._gc_period = gc_period
    self._db_loop_thread = None
    self.checkpoint = None

    # special
    self.monitoring_params = monitoring_params

    self._init(**kwargs)
def main(args, unknown_args):
    """Entry point for RL training: builds env, algorithm, db and trainer.

    Args:
        args: parsed command-line arguments (seed, logdir, expdir, resume, ...)
        unknown_args: extra CLI arguments merged into the config
    """
    args, config = parse_args_uargs(args, unknown_args)
    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    if args.logdir is not None:
        os.makedirs(args.logdir, exist_ok=True)
        dump_environment(config, args.logdir, args.configs)

    if args.expdir is not None:
        # importing the experiment dir registers user-defined components
        module = import_module(expdir=args.expdir)  # noqa: F841
        if args.logdir is not None:
            dump_code(args.expdir, args.logdir)

    env = ENVIRONMENTS.get_from_params(**config["environment"])

    algorithm_name = config["algorithm"].pop("algorithm")

    # pick registry/trainer; on-policy trainers sync epochs with samplers
    if algorithm_name in OFFPOLICY_ALGORITHMS_NAMES:
        ALGORITHMS = OFFPOLICY_ALGORITHMS
        trainer_fn = OffpolicyTrainer
        sync_epoch = False
    elif algorithm_name in ONPOLICY_ALGORITHMS_NAMES:
        ALGORITHMS = ONPOLICY_ALGORITHMS
        trainer_fn = OnpolicyTrainer
        sync_epoch = True
    else:
        # @TODO: add registry for algorithms, trainers, samplers
        raise NotImplementedError()

    db_server = DATABASES.get_from_params(
        **config.get("db", {}), sync_epoch=sync_epoch
    )

    algorithm_fn = ALGORITHMS.get(algorithm_name)
    algorithm = algorithm_fn.prepare_for_trainer(env_spec=env, config=config)

    if args.resume is not None:
        # restore model weights only, optimizer state is left untouched
        checkpoint = utils.load_checkpoint(filepath=args.resume)
        checkpoint = utils.any2device(checkpoint, utils.get_device())
        algorithm.unpack_checkpoint(
            checkpoint=checkpoint,
            with_optimizer=False
        )

    monitoring_params = config.get("monitoring_params", None)

    trainer = trainer_fn(
        algorithm=algorithm,
        env_spec=env,
        db_server=db_server,
        logdir=args.logdir,
        monitoring_params=monitoring_params,
        **config["trainer"],
    )
    trainer.run()
def __init__(self, path: Union[str, Path], inputs: torch.Tensor):
    """Store the tracing configuration for later use.

    Args:
        path (Union[str, Path]): Path to traced model.
        inputs: Input samples.
    """
    super().__init__(CallbackOrder.external)
    # detect the runtime device first, then remember the tracing inputs
    self.device = get_device()
    self.inputs: torch.Tensor = inputs
    # normalize the destination to a Path regardless of the input type
    self.path: Path = Path(path)
def __init__(
    self,
    agent: Union[ActorSpec, CriticSpec],
    env: EnvironmentSpec,
    db_server: DBSpec = None,
    exploration_handler: ExplorationHandler = None,
    logdir: str = None,
    id: int = 0,
    mode: str = "infer",  # train/valid/infer
    weights_sync_period: int = 1,
    weights_sync_mode: str = None,
    seeds: List = None,
    trajectory_limit: int = None,
    force_store: bool = False,
    gc_period: int = 10,
):
    """RL sampler: rolls trajectories out of ``env`` with ``agent``.

    Args:
        agent: actor or critic used to select actions
        env: environment specification
        db_server: optional storage backend for trajectories/weights
        exploration_handler: exploration strategy handler
        logdir: directory for sampler logs
        id: sampler identifier (note: shadows the ``id`` builtin)
        mode: one of "train"/"valid"/"infer"
        weights_sync_period: how often to sync agent weights
        weights_sync_mode: weight synchronization mode
        seeds: optional fixed list of seeds; when given, the seeder
            cycles within ``len(seeds)``
        trajectory_limit: max number of trajectories (unbounded if None)
        force_store: always store sampled trajectories
        gc_period: run garbage collection every N trajectories
    """
    self._device = utils.get_device()
    self._sampler_id = id
    # inference mode implies deterministic action selection below
    self._infer = mode == "infer"
    self.seeds = seeds
    # per-sampler seed offset keeps parallel samplers decorrelated
    self._seeder = Seeder(
        init_seed=42 + id,
        max_seed=len(seeds) if seeds is not None else None)

    # logging
    self._prepare_logger(logdir, mode)
    # process-shared flag that starts/stops sampling from outside
    self._sample_flag = mp.Value(c_bool, False)

    # environment, model, exploration & action handlers
    self.env = env
    self.agent = agent
    self.exploration_handler = exploration_handler
    self.trajectory_index = 0
    self.trajectory_sampler = TrajectorySampler(
        env=self.env,
        agent=self.agent,
        device=self._device,
        deterministic=self._infer,
        sample_flag=self._sample_flag)

    # synchronization configuration
    self.db_server = db_server
    self._weights_sync_period = weights_sync_period
    self._weights_sync_mode = weights_sync_mode
    self._trajectory_limit = trajectory_limit or np.iinfo(np.int32).max
    self._force_store = force_store
    self._gc_period = gc_period
    self._db_loop_thread = None
def compute(self):
    """
    Compute metrics with accumulated statistics

    Returns:
        tuple of metrics: per_class, micro_metric, macro_metric,
            weighted_metric(None if weights is None)
    """
    per_class = []
    total_statistics = {}
    macro_metric = 0
    weighted_metric = 0
    # ddp hotfix, could be done better
    # but metric must handle DDP on it's own
    # TODO: optimise speed
    if self._ddp_backend == "xla":
        # sum each per-class counter across TPU replicas
        device = get_device()
        for _, statistics in self.statistics.items():
            for key in statistics:
                value = torch.tensor([statistics[key]], device=device)
                statistics[key] = xm.all_gather(value).sum(dim=0)
    elif self._ddp_backend == "ddp":
        # sum each per-class counter across DDP workers
        for _, statistics in self.statistics.items():
            for key in statistics:
                value: List[torch.Tensor] = all_gather(statistics[key])
                value: torch.Tensor = torch.sum(torch.vstack(value), dim=0)
                statistics[key] = value

    # per-class metric values; totals are accumulated for micro averaging
    for class_idx, statistics in self.statistics.items():
        value = self.metric_fn(**statistics)
        per_class.append(value)
        macro_metric += value
        if self.weights is not None:
            weighted_metric += value * self.weights[class_idx]
        for stats_name, value in statistics.items():
            total_statistics[stats_name] = (
                total_statistics.get(stats_name, 0) + value)

    macro_metric /= len(self.statistics)
    # micro: metric computed once on the summed statistics of all classes
    micro_metric = self.metric_fn(**total_statistics)

    if self.weights is None:
        weighted_metric = None
    if self.compute_per_class_metrics:
        return per_class, micro_metric, macro_metric, weighted_metric
    else:
        return [], micro_metric, macro_metric, weighted_metric
def get_criterion(self):
    """
    Fetches the criterion. (Only one loss.)

    Reads ``self.criterion_params``: the "loss" key names the loss class,
    and the entry under that name holds its constructor kwargs.

    Returns:
        the instantiated loss object
    """
    loss_name = self.criterion_params["loss"]
    loss_kwargs = self.criterion_params[loss_name]
    if "weight" in list(loss_kwargs.keys()):
        if isinstance(loss_kwargs["weight"], list):
            # class weights from the config are plain lists; torch losses
            # expect a tensor on the runtime device
            weight_tensor = torch.tensor(loss_kwargs["weight"])
            weight_tensor = any2device(weight_tensor, get_device())
            print(f"Converted the `weight` argument in {loss_name}",
                  f" to a {weight_tensor.type()}...")
            loss_kwargs["weight"] = weight_tensor
    # the loss class is looked up by name among this module's globals,
    # so it must be imported/defined at module level
    loss_cls = globals()[loss_name]
    loss = loss_cls(**loss_kwargs)
    print(f"Criterion: {loss}")
    return loss
def main(args, _=None):
    """Run the ``catalyst-data image2embeddings`` script."""
    global IMG_SIZE

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    IMG_SIZE = (args.img_size, args.img_size)  # noqa: WPS442

    # either load a pre-traced TorchScript model or build a fresh encoder
    if args.traced_model is not None:
        device = utils.get_device()
        model = torch.jit.load(str(args.traced_model), map_location=device)
    else:
        model = ResnetEncoder(arch=args.arch, pooling=args.pooling)
    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    # csv rows become a list of dicts consumed by the loader
    df = pd.read_csv(args.in_csv)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())

    open_fn = ImageReader(
        input_key=args.img_col, output_key="image", rootpath=args.rootpath
    )

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dict_transform=dict_transformer,
    )

    # batch-wise inference; embeddings collected on CPU as numpy arrays
    features = []
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for batch in dataloader:
            batch_features = model(batch["image"].to(device))
            batch_features = batch_features.cpu().detach().numpy()
            features.append(batch_features)

    features = np.concatenate(features, axis=0)
    np.save(args.out_npy, features)
def main(args):
    """Train ResNet-18 on the Kaggle ILSVRC (ImageNet) dataset with Catalyst.

    Args:
        args: parsed CLI arguments (currently unused by the body).
    """
    logdir = "./logdir"
    num_epochs = 42

    # detect gpu
    device = utils.get_device()
    # NOTE: the original code contained a stray `utils.fp` expression
    # statement here (an unfinished edit); it would raise AttributeError
    # at runtime and has been removed.
    print(f"device: {device}")

    # dataset
    trainset = ImageNetK(
        '/run/media/mooziisp/仓库/datasets/Kaggle-ILSVRC/ILSVRC',
        split='train',
        transform=transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
            transforms.ToTensor()
        ]))
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=64,
                                              shuffle=True,
                                              num_workers=2,
                                              pin_memory=True)
    loaders = {"train": trainloader}

    # define net
    net = models.resnet18(pretrained=False, num_classes=1000)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=1e-4)

    # trainer
    runner = SupervisedRunner(device=device)
    runner.train(model=net,
                 criterion=criterion,
                 optimizer=optimizer,
                 loaders=loaders,
                 logdir=logdir,
                 callbacks=[AccuracyCallback(num_classes=1000)],
                 num_epochs=num_epochs,
                 verbose=True)
def setup_runtime(cfg_env: DictConfig):
    """
    Setup runtime environment.

    Runtime options: ["cuda", "cuda:0", "cuda:<index>", "cpu", ""]

    Args:
        cfg_env (DictConfig): configuration with ``runtime`` and ``seed``

    Returns:
        torch.device: the resolved runtime device
    """
    runtime: str = cfg_env.runtime
    # split "cuda:1" into ("cuda", "1"); bare "cuda"/"cpu" get no index
    runtime_name, runtime_devices = \
        runtime.split(":") if ":" in runtime else [runtime, ""]
    if runtime_name == "cuda" and runtime_devices:
        os.environ["CUDA_VISIBLE_DEVICES"] = f"{runtime_devices}"
        logger.info(f"[Environment] Configuration: CUDA_VISIBLE_DEVICES="
                    f"{os.environ['CUDA_VISIBLE_DEVICES']}")
    elif runtime_name == "cpu":
        # hiding all CUDA devices forces CPU execution
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        logger.info(f"[Environment] Configuration: CUDA_VISIBLE_DEVICES="
                    f"{os.environ['CUDA_VISIBLE_DEVICES']}")

    # imported after CUDA_VISIBLE_DEVICES is set so device detection
    # respects the configuration above
    from catalyst.utils import set_global_seed, prepare_cudnn, get_device

    seed: int = cfg_env.seed
    set_global_seed(seed)
    logger.info(f"[Environment] Configuration. Seed: {seed}")

    prepare_cudnn(deterministic=True, benchmark=False)
    logger.info(f"[Environment] Configuration. CUDNN: "
                f"deterministic=True, benchmark=False")

    device = get_device()
    logger.info(f"[Environment] Runtime: {device}")
    return device
# This function removes weight_decay for biases and applies our layerwise_params model_params = utils.process_model_params(model, layerwise_params=layerwise_params) # Catalyst has new SOTA optimizers out of box base_optimizer = RAdam(model_params, lr=learning_rate, weight_decay=0.0003) optimizer = Lookahead(base_optimizer) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.25, patience=2) num_epochs = 3 logdir = "./logs/segmentation" device = utils.get_device() print(f"device: {device}") # by default SupervisedRunner uses "features" and "targets", # in our case we get "image" and "mask" keys in dataset __getitem__ runner = SupervisedRunner(device=device, input_key="image", input_target_key="mask") SEED = config.SEED utils.set_global_seed(SEED) utils.prepare_cudnn(deterministic=True) runner.train( model=model, criterion=criterion,
def process_components(
    model: Model,
    criterion: Criterion = None,
    optimizer: Optimizer = None,
    scheduler: Scheduler = None,
    distributed_params: Dict = None,
    device: Device = None,
) -> Tuple[Model, Criterion, Optimizer, Scheduler, Device]:
    """
    Returns the processed model, criterion, optimizer, scheduler and device

    Args:
        model (Model): torch model
        criterion (Criterion): criterion function
        optimizer (Optimizer): optimizer
        scheduler (Scheduler): scheduler
        distributed_params (dict, optional): dict with the parameters
            for distributed and FP16 methond
        device (Device, optional): device
    """
    distributed_params = distributed_params or {}
    distributed_params = copy.deepcopy(distributed_params)
    # environment-provided values (rank, local_rank, ...) override the config
    distributed_params.update(get_distributed_params())
    if device is None:
        device = utils.get_device()

    model: Model = utils.maybe_recursive_call(model, "to", device=device)

    if utils.is_wrapped_with_ddp(model):
        # already wrapped for distributed training: nothing to do
        pass
    elif get_rank() >= 0:
        # distributed run: pin the model to this process's local GPU
        assert isinstance(model, nn.Module)
        local_rank = distributed_params.pop("local_rank", 0)
        device = f"cuda:{local_rank}"
        model = utils.maybe_recursive_call(model, "to", device=device)

        syncbn = distributed_params.pop("syncbn", False)
        use_apex = distributed_params.pop("apex", True) and is_apex_available()

        if use_apex:
            import apex
            # keep only the kwargs that apex.amp.initialize accepts
            amp_params = get_default_params(
                apex.amp.initialize, ["models", "optimizers"])
            amp_params["opt_level"] = "O0"
            for dp in distributed_params:
                if dp in amp_params:
                    amp_params[dp] = distributed_params[dp]

            # amp.initialize returns (model, optimizer) only when an
            # optimizer was passed in
            amp_result = apex.amp.initialize(model, optimizer, **amp_params)
            if optimizer is not None:
                model, optimizer = amp_result
            else:
                model = amp_result

            model = apex.parallel.DistributedDataParallel(model)

            if syncbn:
                model = apex.parallel.convert_syncbn_model(model)
        else:
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank)
    elif torch.cuda.device_count() > 1:
        # single-node multi-GPU fallback: plain DataParallel,
        # applied per-entry when the "model" is a dict of modules
        if isinstance(model, nn.Module):
            model = torch.nn.DataParallel(model)
        elif isinstance(model, dict):
            model = {k: torch.nn.DataParallel(v) for k, v in model.items()}

    model: Model = utils.maybe_recursive_call(model, "to", device=device)

    return model, criterion, optimizer, scheduler, device
def run_ml_pipeline(sampler_inbatch: data.IInbatchTripletSampler) -> float: """ Full metric learning pipeline, including train and val. This function is also used as minimal example in README.md, section name: 'CV - MNIST with Metric Learning'. Args: sampler_inbatch: sampler to forming triplets Returns: best metric value """ # 1. train and valid datasets dataset_root = "./data" transforms = t.Compose([t.ToTensor(), t.Normalize((0.1307, ), (0.3081, ))]) dataset_train = datasets.MnistMLDataset( root=dataset_root, train=True, download=True, transform=transforms, ) sampler = data.BalanceBatchSampler(labels=dataset_train.get_labels(), p=5, k=10) train_loader = DataLoader(dataset=dataset_train, sampler=sampler, batch_size=sampler.batch_size) dataset_val = datasets.MnistQGDataset(root=dataset_root, transform=transforms, gallery_fraq=0.2) val_loader = DataLoader(dataset=dataset_val, batch_size=1024) # 2. model and optimizer model = models.SimpleConv(features_dim=16) optimizer = Adam(model.parameters(), lr=0.0005) # 3. criterion with triplets sampling criterion = nn.TripletMarginLossWithSampler( margin=0.5, sampler_inbatch=sampler_inbatch) # 4. training with catalyst Runner callbacks = [ dl.ControlFlowCallback(dl.CriterionCallback(), loaders="train"), dl.ControlFlowCallback(dl.CMCScoreCallback(topk_args=[1]), loaders="valid"), dl.PeriodicLoaderCallback(valid=100), ] runner = dl.SupervisedRunner(device=utils.get_device()) runner.train( model=model, criterion=criterion, optimizer=optimizer, callbacks=callbacks, loaders={ "train": train_loader, "valid": val_loader }, minimize_metric=False, verbose=True, valid_loader="valid", num_epochs=100, main_metric="cmc01", ) return runner.best_valid_metrics["cmc01"]
def main():
    """Cross-validated training loop for the Bengali.AI grapheme task.

    Trains a ResNet-34-based multi-head classifier over NUM_FOLDS
    multilabel-stratified folds; each fold logs to its own subdirectory.
    """
    # set your params
    DATA_PATH = '/content/drive/My Drive/kaggle/bengaliai-cv19/dataset'
    # MODEL_PATH = '/content/drive/My Drive/kaggle/bengaliai-cv19/model/se_resnext50_32x4d-a260b3a4.pth'
    # MODEL_PATH='/content/drive/My Drive/kaggle/bengaliai-cv19/model/efficientnet-b3-5fb5a3c3.pth'
    BASE_LOGDIR = '/content/drive/My Drive/kaggle/bengaliai-cv19/logs'
    NUM_FOLDS = 5
    BATCH_SIZE = 64
    EPOCHS = 20
    SEED = 1234
    SIZE = 224
    LR = 0.003
    HOLD_OUT = False

    # fix seed
    set_global_seed(SEED)

    # read dataset
    train, _, _ = read_data(DATA_PATH)
    train_all_images = prepare_image(DATA_PATH,
                                     data_type='train',
                                     submission=False)

    # init
    target_col = ['grapheme_root', 'consonant_diacritic', 'vowel_diacritic']
    device = get_device()
    # light geometric + cutout augmentation for training only
    train_data_transforms = albu.Compose([
        albu.ShiftScaleRotate(rotate_limit=10, scale_limit=.1),
        albu.Cutout(p=0.5),
    ])
    test_data_transforms = None

    # cross validation
    # multilabel stratification keeps all three target columns balanced
    kf = MultilabelStratifiedKFold(n_splits=NUM_FOLDS, random_state=SEED)
    ids = kf.split(X=train_all_images, y=train[target_col].values)

    # fold_scores = []
    for fold, (train_idx, valid_idx) in enumerate(ids):
        print("Current Fold: ", fold + 1)
        logdir = os.path.join(BASE_LOGDIR, 'fold_{}'.format(fold + 1))
        os.makedirs(logdir, exist_ok=True)

        train_df, valid_df = train.iloc[train_idx], train.iloc[valid_idx]
        print("Train and Valid Shapes are", train_df.shape, valid_df.shape)

        print("Preparing train datasets....")
        train_dataset = BengaliAIDataset(images=train_all_images[train_idx],
                                         labels=train_df[target_col].values,
                                         size=SIZE,
                                         transforms=train_data_transforms)

        print("Preparing valid datasets....")
        valid_dataset = BengaliAIDataset(images=train_all_images[valid_idx],
                                         labels=valid_df[target_col].values,
                                         size=SIZE,
                                         transforms=test_data_transforms)

        print("Preparing dataloaders datasets....")
        train_loader = DataLoader(train_dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=False)
        loaders = {'train': train_loader, 'valid': valid_loader}

        # release memory
        del train_df, valid_df, train_dataset, valid_dataset
        gc.collect()
        torch.cuda.empty_cache()

        # init models
        resnet34 = pretrainedmodels.__dict__["resnet34"](pretrained="imagenet")
        model = BengaliBaselineClassifier(pretrainedmodels=resnet34, hdim=512)
        # model = BengaliBaselineClassifier(pretrainedmodels=se_resnext50_32x4d(model_path=MODEL_PATH))
        # model = CustomEfficientNet.from_pretrained('efficientnet-b3', MODEL_PATH)
        model = model.to(device)
        criterions = {'train': BaselineLoss(), 'valid': BaselineLoss()}
        optimizer = AdamW(model.parameters(), lr=LR)
        scheduler = OneCycleLRWithWarmup(optimizer,
                                         num_steps=EPOCHS,
                                         lr_range=(0.001, 0.0001),
                                         warmup_steps=1)

        # catalyst trainer
        runner = BengaliRunner(device=device)
        # model training
        runner.train(model=model, criterions=criterions, optimizer=optimizer,
                     scheduler=scheduler, loaders=loaders, logdir=logdir,
                     num_epochs=EPOCHS, score_func=macro_recall)

        # release memory
        del model, runner, train_loader, valid_loader, loaders
        gc.collect()
        torch.cuda.empty_cache()

        # HOLD_OUT=True trains a single fold only
        if HOLD_OUT is True:
            break
    return True
def process_components(
    model: Model,
    criterion: Criterion = None,
    optimizer: Optimizer = None,
    scheduler: Scheduler = None,
    distributed_params: Dict = None,
    device: Device = None,
) -> Tuple[Model, Criterion, Optimizer, Scheduler, Device]:
    """
    Returns the processed model, criterion, optimizer, scheduler and device

    Args:
        model (Model): torch model
        criterion (Criterion): criterion function
        optimizer (Optimizer): optimizer
        scheduler (Scheduler): scheduler
        distributed_params (dict, optional): dict with the parameters
            for distributed and FP16 methond
        device (Device, optional): device
    """
    distributed_params = distributed_params or {}
    distributed_params = copy.deepcopy(distributed_params)
    if device is None:
        device = utils.get_device()

    model: Model = utils.maybe_recursive_call(model, "to", device=device)

    if utils.is_wrapped_with_ddp(model):
        # already wrapped for distributed training: nothing to do
        pass
    elif len(distributed_params) > 0:
        assert isinstance(model, nn.Module)
        # rank == -1 means "no process group"; FP16/DataParallel may
        # still apply below
        distributed_rank = distributed_params.pop("rank", -1)
        syncbn = distributed_params.pop("syncbn", False)

        if distributed_rank > -1:
            torch.cuda.set_device(distributed_rank)
            torch.distributed.init_process_group(
                backend="nccl", init_method="env://")

        if "opt_level" in distributed_params:
            utils.assert_fp16_available()
            from apex import amp

            # amp.initialize returns (model, optimizer) only when an
            # optimizer was passed in
            amp_result = amp.initialize(model, optimizer, **distributed_params)
            if optimizer is not None:
                model, optimizer = amp_result
            else:
                model = amp_result

        if distributed_rank > -1:
            from apex.parallel import DistributedDataParallel
            model = DistributedDataParallel(model)

            if syncbn:
                from apex.parallel import convert_syncbn_model
                model = convert_syncbn_model(model)

        if distributed_rank <= -1 and torch.cuda.device_count() > 1:
            model = torch.nn.DataParallel(model)
    elif torch.cuda.device_count() > 1:
        # single-node multi-GPU fallback: plain DataParallel,
        # applied per-entry when the "model" is a dict of modules
        if isinstance(model, nn.Module):
            model = torch.nn.DataParallel(model)
        elif isinstance(model, dict):
            model = {k: torch.nn.DataParallel(v) for k, v in model.items()}

    model: Model = utils.maybe_recursive_call(model, "to", device=device)

    return model, criterion, optimizer, scheduler, device
def main():
    """Train and evaluate a U-Net segmentation model for IGVC.

    Configuration is read from ``config/igvc.yaml``; data array paths come
    from the CLI. After training, predictions on the test set are saved.
    """
    # Enable argument parsing for file paths
    args = vars(get_args())
    train_images_path = args["train_images"]
    train_masks_path = args["train_masks"]
    test_images_path = args["test_images"]
    test_masks_path = args["test_masks"]

    # print out yaml file configuration
    dir_path = os.path.dirname(os.path.realpath(__file__))
    yaml_path = os.path.join(dir_path, "config/igvc.yaml")
    ARCH = yaml.safe_load(open(yaml_path, "r"))

    # Set a seed for reproducibility
    utils.set_global_seed(ARCH["train"]["seed"])
    utils.prepare_cudnn(deterministic=ARCH["train"]["cudnn"])

    # Set up U-Net with pretrained EfficientNet backbone
    model = smp.Unet(
        encoder_name=ARCH["encoder"]["name"],
        encoder_weights=ARCH["encoder"]["weight"],
        classes=ARCH["train"]["classes"],
        activation=ARCH["encoder"]["activation"],
    )

    # Get Torch loaders
    loaders = get_loaders(
        images=np.load(train_images_path),
        masks=np.load(train_masks_path),
        image_arr_path=train_images_path,
        mask_arr_path=train_masks_path,
        random_state=ARCH["train"]["random_state"],
        valid_size=ARCH["train"]["valid_size"],
        batch_size=ARCH["train"]["batch_size"],
        num_workers=ARCH["train"]["num_workers"],
    )

    # Optimize for cross entropy using Adam
    criterion = {
        "CE": CrossentropyND(),
    }
    optimizer = AdamW(
        model.parameters(),
        lr=ARCH["train"]["lr"],
        betas=(ARCH["train"]["betas_min"], ARCH["train"]["betas_max"]),
        eps=float(ARCH["train"]["eps"]),
        weight_decay=ARCH["train"]["w_decay"],
        amsgrad=ARCH["train"]["amsgrad"],
    )
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=ARCH["train"]["optim_factor"],
        patience=ARCH["train"]["optim_patience"],
    )

    device = utils.get_device()
    print("Using device: {}".format(device))
    print(f"torch: {torch.__version__}, catalyst: {catalyst.__version__}")

    runner = SupervisedRunner(device=device,
                              input_key="image",
                              input_target_key="mask")

    # Use Catalyst callbacks for metric calculations during training
    callbacks = [
        CriterionCallback(input_key="mask", prefix="loss", criterion_key="CE"),
        MulticlassDiceMetricCallback(input_key="mask"),
    ]

    # Train and print model training logs
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=callbacks,
        logdir=ARCH["train"]["logdir"],
        num_epochs=ARCH["train"]["epochs"],
        main_metric="loss",
        minimize_metric=ARCH["train"]["minimize_metric"],
        fp16=ARCH["train"]["fp16"],
        verbose=ARCH["train"]["verbose"],
    )

    # Test model on test dataset
    test_data = SegmentationDataset(test_images_path, test_masks_path)
    infer_loader = DataLoader(
        test_data,
        batch_size=ARCH["test"]["batch_size"],
        shuffle=ARCH["test"]["shuffle"],
        num_workers=ARCH["test"]["num_workers"],
    )

    # Get model predictions on test dataset
    predictions = np.vstack(
        list(
            map(
                lambda x: x["logits"].cpu().numpy(),
                runner.predict_loader(
                    loader=infer_loader,
                    resume=f"content/full_model2/checkpoints/best.pth",
                ),
            )))
    save_result(predictions, test_data)
def smart_way():
    """Fine-tune a regression head on top of a pretrained segmentation model.

    Loads the best PSPNet segmentation checkpoint, wraps it in
    ``RegressionFromSegmentation``, trains with layerwise learning rates,
    saves the best model and attempts to trace it.
    """
    args = parse_arguments()
    SEED = args.seed
    ROOT = Path(args.dataset)
    img_paths, targets = retrieve_dataset(ROOT)

    train_transforms = compose(
        [resize_transforms(), hard_transforms(), post_transforms()])
    valid_transforms = compose([pre_transforms(), post_transforms()])

    loaders = get_loaders(
        img_paths=img_paths,
        targets=targets,
        random_state=SEED,
        batch_size=8,
        train_transforms_fn=train_transforms,
        valid_transforms_fn=valid_transforms,
    )

    logdir = './table_recognition/nn/regression/logs6/'

    # start from the best segmentation checkpoint and add a regression head
    model = torch.load(
        f'./table_recognition/nn/segmentation/logs/resnet18_PSPNet/save/best_model.pth'
    )
    model: RegressionFromSegmentation = RegressionFromSegmentation(model)
    model.to(utils.get_device())

    # encoder layers get a lower lr and their own weight decay
    learning_rate = 0.001
    encoder_learning_rate = 0.0005
    layerwise_params = {
        "encoder*": dict(lr=encoder_learning_rate, weight_decay=0.00003)
    }
    model_params = utils.process_model_params(
        model, layerwise_params=layerwise_params)
    base_optimizer = RAdam(model_params, lr=learning_rate, weight_decay=0.0003)
    optimizer = Lookahead(base_optimizer)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=0.25,
                                                     patience=2)
    device = utils.get_device()

    runner = CustomRunner2(device=device)
    runner.train(model=model,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 loaders=loaders,
                 logdir=logdir,
                 num_epochs=1000,
                 verbose=True,
                 load_best_on_end=True,
                 main_metric='loss')

    best_model_save_dir = os.path.join(logdir, 'save')
    os.makedirs(best_model_save_dir, exist_ok=True)
    torch.save(model, os.path.join(
        best_model_save_dir,
        'best_model.pth'))  # save best model (by valid loss)

    batch = next(iter(loaders["valid"]))
    try:
        runner.trace(
            model=model, batch=batch, logdir=logdir,
            fp16=False)  # optimized version (not all models can be traced)
    except Exception:
        pass
def main():
    """Cross-validated BERT fine-tuning for the Google QUEST challenge.

    Runs GroupKFold CV grouped by question body, trains a BCE multilabel
    head on top of BERT per fold, and scores each fold with mean Spearman
    correlation.
    """
    # hyper param
    # TODO: set your params
    num_folds = 5
    seed = 1234
    base_dataset_path = '/content/drive/My Drive/kaggle/google-quest-challenge/dataset'
    batch_size = 4
    num_epochs = 4
    bert_model = 'bert-base-uncased'
    base_logdir = '/kaggle/google_quest/bert'

    # fix seed
    set_global_seed(seed)
    device = get_device()

    # set up logdir
    now = datetime.now()
    base_logdir = os.path.join(base_logdir, now.strftime("%Y%m%d%H%M%S"))
    os.makedirs(base_logdir, exist_ok=True)

    # dump this scripts
    my_file_path = os.path.abspath(__file__)
    # FIX: shutil.copyfile() requires a full destination *file* path and
    # raises IsADirectoryError when given a directory; shutil.copy()
    # accepts a directory and keeps the source filename.
    shutil.copy(my_file_path, base_logdir)

    # load dataset
    # TODO: set your dataset
    train, test, sample_submission = read_data(base_dataset_path)
    input_cols = list(train.columns[[1, 2, 5]])
    target_cols = list(train.columns[11:])
    num_labels = len(target_cols)

    # init Bert
    tokenizer = BertTokenizer.from_pretrained(bert_model)

    # execute CV
    # TODO: set your CV method
    # grouping by question body keeps duplicate questions in one fold
    kf = GroupKFold(n_splits=num_folds)
    ids = kf.split(train['question_body'], groups=train['question_body'])

    fold_scores = []
    for fold, (train_idx, valid_idx) in enumerate(ids):
        print("Current Fold: ", fold + 1)
        logdir = os.path.join(base_logdir, 'fold_{}'.format(fold + 1))
        os.makedirs(logdir, exist_ok=True)

        # create dataloader
        train_df, val_df = train.iloc[train_idx], train.iloc[valid_idx]
        print("Train and Valid Shapes are", train_df.shape, val_df.shape)

        print("Preparing train datasets....")
        inputs_train = compute_input_arrays(train_df,
                                            input_cols,
                                            tokenizer,
                                            max_sequence_length=512)
        outputs_train = compute_output_arrays(train_df, columns=target_cols)
        # sequence length = position of the first padding token (0);
        # rows with no padding keep the full max length
        lengths_train = np.argmax(inputs_train[0] == 0, axis=1)
        lengths_train[lengths_train == 0] = inputs_train[0].shape[1]

        print("Preparing valid datasets....")
        inputs_valid = compute_input_arrays(val_df,
                                            input_cols,
                                            tokenizer,
                                            max_sequence_length=512)
        outputs_valid = compute_output_arrays(val_df, columns=target_cols)
        lengths_valid = np.argmax(inputs_valid[0] == 0, axis=1)
        lengths_valid[lengths_valid == 0] = inputs_valid[0].shape[1]

        print("Preparing dataloaders datasets....")
        train_set = QuestDataset(inputs=inputs_train,
                                 lengths=lengths_train,
                                 labels=outputs_train)
        train_loader = DataLoader(train_set,
                                  batch_size=batch_size,
                                  shuffle=True)
        valid_set = QuestDataset(inputs=inputs_valid,
                                 lengths=lengths_valid,
                                 labels=outputs_valid)
        valid_loader = DataLoader(valid_set,
                                  batch_size=batch_size,
                                  shuffle=False)

        # init models
        model = CustomBertForSequenceClassification.from_pretrained(
            bert_model, num_labels=num_labels, output_hidden_states=True)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
        # NOTE(review): num_warmup_steps=0.05 looks like a fraction but the
        # scheduler expects a step count — verify against the scheduler API
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0.05,
            num_training_steps=num_epochs * len(train_loader))

        # model training
        runner = BertRunner(device=device)
        loaders = {'train': train_loader, 'valid': valid_loader}
        print("Model Training....")
        runner.train(model=model, criterion=criterion, optimizer=optimizer,
                     scheduler=scheduler, loaders=loaders, logdir=logdir,
                     num_epochs=num_epochs,
                     score_func=mean_spearmanr_correlation_score)

        # calc valid score
        best_model_path = os.path.join(logdir, 'best_model.pth')
        val_preds = runner.predict_loader(model,
                                          loaders['valid'],
                                          resume=best_model_path)
        val_truth = train[target_cols].iloc[valid_idx].values
        # TODO: set your score function
        cv_score = mean_spearmanr_correlation_score(val_truth, val_preds)
        print('Fold {} CV score : {}'.format(fold + 1, cv_score))
        fold_scores.append(cv_score)
    return True
def run(config_file, device_id, idx_fold):
    """Train a CenterNetFPN detector for one CV fold on one GPU.

    Intended for an external n-folds driver loop (as opposed to train.py,
    which handles a single, explicitly-configured fold).

    Args:
        config_file: path to the YAML/JSON experiment config.
        device_id: CUDA device index; exported via CUDA_VISIBLE_DEVICES.
        idx_fold: fold index to train in this invocation.
    """
    # pin this process to the requested GPU before any CUDA init
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device_id)
    print('info: use gpu No.{}'.format(device_id))

    config = load_config(config_file)

    # for n-folds loop: rewrite config.data.params.idx_fold / work_dir so
    # each fold trains into its own '<work_dir>_fold<idx>' directory.
    # NOTE: the branch order matters -- idx_fold is only writable when the
    # config says -1 (unset) or 0 (resuming past fold 0).
    if config.data.params.idx_fold == -1:
        # fold not fixed in the config: take it from the argument
        config.data.params.idx_fold = idx_fold
        config.work_dir = config.work_dir + '_fold{}'.format(idx_fold)
    elif config.data.params.idx_fold == 0:
        # config pinned to fold 0: only allowed when moving on to a
        # different fold (resume scenario); work_dir already ends in
        # '_fold<k>', so strip and re-suffix it
        original_fold = int(config.work_dir.split('_fold')[1])
        if original_fold == idx_fold:
            raise Exception(
                'if you specify fold 0, you should use train.py or resume from fold 1.'
            )
        config.data.params.idx_fold = idx_fold
        config.work_dir = config.work_dir.split('_fold')[0] + '_fold{}'.format(
            idx_fold)
    else:
        raise Exception('you should use train.py if idx_fold is specified.')
    print('info: training for fold {}'.format(idx_fold))

    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)

    # train/valid augmentation pipelines (valid uses the test transforms)
    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    # one loader per phase, both reading from the training dataframe and
    # split by idx_fold inside make_loader
    dataloaders = {
        phase: make_loader(
            df_path=config.data.train_df_path,
            data_dir=config.data.train_dir,
            features=config.data.features,
            phase=phase,
            img_size=(config.data.height, config.data.width),
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx_fold,
            transforms=all_transforms[phase],
            horizontal_flip=config.train.horizontal_flip,
            model_scale=config.data.model_scale,
            debug=config.debug,
            pseudo_path=config.data.pseudo_path,
        )
        for phase in ['train', 'valid']
    }

    # create segmentation model with pre trained encoder;
    # one output channel per configured feature
    num_features = len(config.data.features)
    print('info: num_features =', num_features)
    model = CenterNetFPN(
        slug=config.model.encoder,
        num_classes=num_features,
    )

    optimizer = get_optimizer(model, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model, device=get_device())

    # train setting: loss and metric callbacks come from the config
    criterion, callbacks = get_criterion_and_callback(config)

    if config.train.early_stop_patience > 0:
        callbacks.append(
            EarlyStoppingCallback(patience=config.train.early_stop_patience))

    # gradient accumulation: step the optimizer every
    # accumulation_size / batch_size batches
    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend(
            [OptimizerCallback(accumulation_steps=accumulation_steps)])

    # to resume from check points if exists
    if os.path.exists(config.work_dir + '/checkpoints/last_full.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir +
                               '/checkpoints/last_full.pth'))

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=config.train.fp16,
    )
def process_components(
    model: Model,
    criterion: Criterion = None,
    optimizer: Optimizer = None,
    scheduler: Scheduler = None,
    distributed_params: Dict = None,
    device: Device = None,
) -> Tuple[Model, Criterion, Optimizer, Scheduler, Device]:
    """
    Returns the processed model, criterion, optimizer, scheduler and device.

    Depending on the runtime, the model is wrapped for distributed data
    parallel (DDP, optionally via apex) or plain data parallel (DP), and the
    optimizer is re-created by apex when mixed precision is requested.

    Args:
        model (Model): torch model
        criterion (Criterion): criterion function
        optimizer (Optimizer): optimizer
        scheduler (Scheduler): scheduler
        distributed_params (dict, optional): dict with the parameters
            for distributed and FP16 method
        device (Device, optional): device
    """
    distributed_params = distributed_params or {}
    # deep-copy so the pops below never mutate the caller's dict
    distributed_params = copy.deepcopy(distributed_params)
    distributed_params.update(get_distributed_params())

    if device is None:
        device = utils.get_device()

    # apex is opt-out (default True) but only usable when installed
    use_apex = distributed_params.pop("apex", True) and is_apex_available()

    model: Model = utils.maybe_recursive_call(model, "to", device=device)

    if utils.is_wrapped_with_ddp(model):
        # already wrapped by the caller -- nothing to do
        pass
    # distributed data parallel run (ddp) (with apex support)
    elif get_rank() >= 0:
        # FIX: corrected misspelled assertion message ("dixtributed")
        assert isinstance(model, nn.Module), \
            "No support for distributed KV model yet"

        local_rank = distributed_params.pop("local_rank", 0)
        device = f"cuda:{local_rank}"
        model = utils.maybe_recursive_call(model, "to", device=device)

        syncbn = distributed_params.pop("syncbn", False)

        if use_apex:
            import apex
            model, optimizer = initialize_apex(model, optimizer,
                                               **distributed_params)
            model = apex.parallel.DistributedDataParallel(model)

            if syncbn:
                model = apex.parallel.convert_syncbn_model(model)
        else:
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank)
    # data parallel run (dp) (with apex support)
    else:
        # apex issue https://github.com/deepset-ai/FARM/issues/210
        can_use_apex = \
            (use_apex and torch.cuda.device_count() == 1) \
            or (
                torch.cuda.device_count() > 1
                and distributed_params.get("opt_level", "O0") == "O1"
            )

        if can_use_apex:
            assert isinstance(model, nn.Module), \
                "No support for apex KV model yet"

            model, optimizer = initialize_apex(model, optimizer,
                                               **distributed_params)

        if torch.cuda.device_count() > 1:
            if isinstance(model, nn.Module):
                model = nn.DataParallel(model)
            elif isinstance(model, dict):
                model = {k: nn.DataParallel(v) for k, v in model.items()}

    model: Model = utils.maybe_recursive_call(model, "to", device=device)

    return model, criterion, optimizer, scheduler, device
def simple_way():
    """Train the table-corner regression model with a plain Catalyst setup.

    Loads the dataset given on the command line, trains for 50 epochs,
    saves the best model (by valid loss) and attempts to trace it.
    """
    args = parse_arguments()
    SEED = args.seed
    ROOT = Path(args.dataset)
    img_paths, targets = retrieve_dataset(ROOT)

    train_transforms = compose(
        [resize_transforms(), hard_transforms(), post_transforms()])
    valid_transforms = compose([pre_transforms(), post_transforms()])

    loaders = get_loaders(
        img_paths=img_paths,
        targets=targets,
        random_state=SEED,
        batch_size=8,
        train_transforms_fn=train_transforms,
        valid_transforms_fn=valid_transforms,
    )

    logdir = './table_recognition/nn/regression/logs5/'

    # resnet18 backbone wrapped by the project's regression head
    # (cleaned up: removed the dead commented-out inference/visualization
    # block and the unused encoder_learning_rate local)
    model = Net(models.resnet18(pretrained=True))

    # only the head (model.fc) is optimized; the backbone keeps its
    # pre-trained weights
    learning_rate = 0.001
    optimizer = optim.Adam(model.fc.parameters(),
                           lr=learning_rate,
                           weight_decay=0.00003)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=0.25,
                                                     patience=2)
    device = utils.get_device()

    runner = CustomRunner(device=device)
    runner.train(model=model,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 loaders=loaders,
                 logdir=logdir,
                 num_epochs=50,
                 verbose=True,
                 load_best_on_end=True,
                 main_metric='loss')

    # save best model (by valid loss)
    best_model_save_dir = os.path.join(logdir, 'save')
    os.makedirs(best_model_save_dir, exist_ok=True)
    torch.save(model, os.path.join(best_model_save_dir, 'best_model.pth'))

    # optimized (traced) version; not all models can be traced, so this
    # is deliberately best-effort
    batch = next(iter(loaders["valid"]))
    try:
        runner.trace(model=model, batch=batch, logdir=logdir, fp16=False)
    except Exception:
        pass
def main(train, test, features, target):
    """Run a KFold CV training loop for a simple NN and write submission.csv.

    Note: all four parameters are reassigned from the YAML config /
    preprocessed data below; they are kept only for interface compatibility.

    Returns:
        bool: True once every fold is trained and the submission is saved.
    """
    # get args
    args = parse_arguments()
    params = yaml_to_json(args.yaml_path)

    # hyper params
    num_folds = params.fold
    seed = params.seed
    base_path = params.base_path
    target_cols = params.target
    features_cols = params.features
    preprocessed_data_path = params.preprocessed_data
    batch_size = params.batch_size
    num_epochs = params.epochs
    # ex) '/hoge/logs'
    base_logdir = params.base_logdir

    # fix seed
    set_global_seed(seed)
    device = get_device()

    # set up logdir.
    # BUG FIX: the original called os.path.join(base_logdir + timestamp),
    # concatenating without a path separator; pass separate arguments.
    now = datetime.now()
    base_logdir = os.path.join(base_logdir, now.strftime("%Y%m%d%H%M%S"))
    os.makedirs(base_logdir, exist_ok=True)

    # dump yaml contents
    with open(os.path.join(base_logdir, 'params.json'), mode="w") as f:
        json.dump(params, f, indent=4)

    # dump this script for reproducibility.
    # BUG FIX: shutil.copyfile() raises IsADirectoryError for a directory
    # destination; shutil.copy() accepts one.
    my_file_path = os.path.abspath(__file__)
    shutil.copy(my_file_path, base_logdir)

    # load dataset
    if preprocessed_data_path == '':
        train, test, sample_submission = read_data(base_path)  # noqa
        # TODO: You should implement these function!!
        train, test = preprocess(train, test)  # noqa
        train, test = build_feature(train, test)  # noqa
    else:
        train = pd.read_csv(preprocessed_data_path + 'train.csv')
        test = pd.read_csv(preprocessed_data_path + 'test.csv')
        sample_submission = pd.read_csv(preprocessed_data_path +
                                        'sample_submission.csv')

    # execute CV
    # TODO: set your CV method
    # BUG FIX: sklearn requires shuffle=True when random_state is set;
    # the original raised/warned depending on the sklearn version.
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    ids = kf.split(train)

    fold_scores = []
    test_preds = []
    for fold, (train_idx, valid_idx) in enumerate(ids):
        print('Fold {}'.format(fold + 1))
        # BUG FIX: same os.path.join concatenation issue as above
        logdir = os.path.join(base_logdir, 'fold_{}'.format(fold + 1))
        os.makedirs(logdir, exist_ok=True)

        # data (should the target be normalized...?)
        X_train = train[features_cols]
        Y_train = train[target_cols]
        # BUG FIX: the original took the test features from `train`, so
        # every "test" prediction was actually computed on training rows.
        X_test = test[features_cols]

        # create dataloaders
        train_dls, test_dl = create_data_loader(
            X_train.iloc[train_idx].to_numpy(),
            Y_train.iloc[train_idx].to_numpy(),
            X_train.iloc[valid_idx].to_numpy(),
            Y_train.iloc[valid_idx].to_numpy(),
            X_test.to_numpy(),
            batch_size=batch_size)

        # init models
        # TODO: set your model and learning condition
        # NOTE: a factory keyed by name would make this more reusable
        model = SampleNN(input_dim=1000, out_dim=1)
        criterion = nn.BCELoss()
        optimizer = torch.optim.AdamW(model.parameters())
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

        # init catalyst runner
        runner = SupervisedRunner(device=device)

        # model training
        runner.train(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=train_dls,
            logdir=logdir,
            num_epochs=num_epochs,
            callbacks=[EarlyStoppingCallback(patience=15, min_delta=0)],
            verbose=False)

        # calculate valid score with the best checkpoint
        best_model_path = logdir + '/checkpoints/best.pth'
        val_preds = runner.predict_loader(model,
                                          train_dls['valid'],
                                          resume=best_model_path,
                                          verbose=False)
        val_truth = Y_train.iloc[valid_idx].values
        # TODO: set your score function
        cv_score = mean_spearmanr_correlation_score(val_truth, val_preds)
        print('Fold {} CV score : {}'.format(fold + 1, cv_score))
        fold_scores.append(cv_score)

        # test prediction.
        # BUG FIX: the original divided each fold's prediction by num_folds
        # AND averaged with np.mean below, scaling the final predictions by
        # 1 / num_folds**2. np.mean alone performs the fold averaging.
        test_pred = runner.predict_loader(
            model, test_dl, resume=best_model_path, verbose=False)
        test_preds.append(test_pred)

    # submit: average the per-fold test predictions
    # TODO: set your submit process
    sample_submission[target_cols] = np.mean(test_preds, axis=0)
    sample_submission.to_csv('submission.csv')
    return True
def run(config_file):
    """Train an smp segmentation model from a YAML config (Colab/Kaggle aware).

    Args:
        config_file: path to the experiment configuration file.
    """
    config = load_config(config_file)

    # set up the environment flags for working with the KAGGLE GPU OR COLAB_GPU
    if 'COLAB_GPU' in os.environ:
        config.work_dir = '/content/drive/My Drive/kaggle_cloud/' + config.work_dir
    elif 'KAGGLE_WORKING_DIR' in os.environ:
        config.work_dir = '/kaggle/working/' + config.work_dir
    print('working directory:', config.work_dir)

    # save the configuration to the working dir
    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    # Enter the GPUS you have,
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    # our dataset has an explicit validation folder, use that later.
    all_transforms['valid'] = get_transforms(config.transforms.test)

    print("before rajat config", config.data.height, config.data.width)

    # fetch the dataloaders we need
    dataloaders = {
        phase: make_loader(data_folder=config.data.train_dir,
                           df_path=config.data.train_df_path,
                           phase=phase,
                           img_size=(config.data.height, config.data.width),
                           batch_size=config.train.batch_size,
                           num_workers=config.num_workers,
                           idx_fold=config.data.params.idx_fold,
                           transforms=all_transforms[phase],
                           num_classes=config.data.num_classes,
                           pseudo_label_path=config.train.pseudo_label_path,
                           debug=config.debug)
        for phase in ['train', 'valid']
    }

    # creating the segmentation model with pre-trained encoder
    # (see the smp library for the full parameter list: encoder_name,
    # encoder_depth, encoder_weights, classes, activation, aux_params, ...)
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    # fetch the loss and set per-module learning rates
    criterion = get_loss(config)
    params = [
        {'params': model.decoder.parameters(),
         'lr': config.optimizer.params.decoder_lr},
        {'params': model.encoder.parameters(),
         'lr': config.optimizer.params.encoder_lr},
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # catalyst supervised runner
    # https://github.com/catalyst-team/catalyst/blob/master/catalyst/dl/runner/supervised.py
    runner = SupervisedRunner(model=model, device=get_device())

    # @pavel,srk,rajat,vladimir,pudae check the IOU and the Dice Callbacks
    callbacks = [DiceCallback(), IouCallback()]

    # adding patience
    if config.train.early_stop_patience > 0:
        callbacks.append(
            EarlyStoppingCallback(patience=config.train.early_stop_patience))

    # gradient accumulation: we take zero_grad only after
    # accumulation_steps batches
    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend([
            CriterionCallback(),
            OptimizerCallback(accumulation_steps=accumulation_steps)
        ])

    # to resume from check points if exists.
    # BUG FIX: the existence check looked at best.pth while the resume path
    # was last_full.pth; check the file that is actually resumed so the
    # CheckpointCallback never points at a missing file.
    if os.path.exists(config.work_dir + '/checkpoints/last_full.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir +
                               '/checkpoints/last_full.pth'))

    # mixup (https://arxiv.org/pdf/1710.09412.pdf) and cutmix (a weighted
    # combination of cutout and mixup).
    # BUG FIX: the original appended MixupCallback/CutMixCallback a second
    # time unconditionally after these flags, so both callbacks always ran
    # (and ran twice when enabled). Append them only when configured.
    if config.train.mixup:
        callbacks.append(MixupCallback())
    if config.train.cutmix:
        callbacks.append(CutMixCallback())

    # training loop
    # https://github.com/catalyst-team/catalyst/blob/master/catalyst/dl/runner/supervised.py
    # fp16 takes care of nvidia mixed precision
    print(config.work_dir)
    print(config.train.minimize_metric)
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=False,
    )
def train_segmentation_model( model: torch.nn.Module, logdir: str, num_epochs: int, loaders: Dict[str, DataLoader] ): criterion = { "dice": DiceLoss(), "iou": IoULoss(), "bce": nn.BCEWithLogitsLoss() } learning_rate = 0.001 encoder_learning_rate = 0.0005 layerwise_params = {"encoder*": dict(lr=encoder_learning_rate, weight_decay=0.00003)} model_params = utils.process_model_params(model, layerwise_params=layerwise_params) base_optimizer = RAdam(model_params, lr=learning_rate, weight_decay=0.0003) optimizer = Lookahead(base_optimizer) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.25, patience=2) device = utils.get_device() runner = SupervisedRunner(device=device, input_key='image', input_target_key='mask') callbacks = [ CriterionCallback( input_key="mask", prefix="loss_dice", criterion_key="dice" ), CriterionCallback( input_key="mask", prefix="loss_iou", criterion_key="iou" ), CriterionCallback( input_key="mask", prefix="loss_bce", criterion_key="bce" ), MetricAggregationCallback( prefix="loss", mode="weighted_sum", metrics={"loss_dice": 1.0, "loss_iou": 1.0, "loss_bce": 0.8}, ), # metrics DiceCallback(input_key='mask'), IouCallback(input_key='mask'), ] runner.train( model=model, criterion=criterion, optimizer=optimizer, scheduler=scheduler, loaders=loaders, callbacks=callbacks, logdir=logdir, num_epochs=num_epochs, main_metric="iou", minimize_metric=False, verbose=True, load_best_on_end=True, ) best_model_save_dir = os.path.join(logdir, 'save') os.makedirs(best_model_save_dir, True) torch.save(model, os.path.join(best_model_save_dir, 'best_model.pth')) # save best model (by valid loss) batch = next(iter(loaders["valid"])) try: runner.trace(model=model, batch=batch, logdir=logdir, fp16=False) # optimized version (not all models can be traced) except Exception: pass
from catalyst.utils import get_device from catalyst.dl.runner import SupervisedRunner from catalyst.dl.callbacks import DiceCallback, EarlyStoppingCallback from utils.dataset import CustomDataset from utils.augmentation import get_validation_augmentation, get_training_augmentation from utils.losses import WeightedBCEDiceLoss from utils.callbacks import CometCallback from models.EffUNet import EffUNet os.environ["CUDA_VISIBLE_DEVICES"] = ... device = get_device() hyper_params = { "in_channels": ..., "num_classes": ..., "batch_size": ..., "num_epochs": ..., "learning_rate": 1e-3, "lambda_dice": 0.5, "lambda_bceWithLogits": 1.5, "logdir": ... } experiment = Experiment(...) experiment.log_parameters(hyper_params)
def run(config_file):
    """Train an smp segmentation model from a YAML config (single GPU, fp16).

    Args:
        config_file: path to the experiment configuration file.
    """
    config = load_config(config_file)

    # adjust the working dir for Colab / Kaggle environments
    if 'COLAB_GPU' in os.environ:
        config.work_dir = '/content/drive/My Drive/kaggle_cloud/' + config.work_dir
    elif 'KAGGLE_WORKING_DIR' in os.environ:
        config.work_dir = '/kaggle/working/' + config.work_dir
    print('working directory:', config.work_dir)

    # persist the resolved config next to the run artifacts
    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(
            data_folder=config.data.train_dir,
            df_path=config.data.train_df_path,
            phase=phase,
            img_size=(config.data.height, config.data.width),
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx_fold,
            transforms=all_transforms[phase],
            num_classes=config.data.num_classes,
            pseudo_label_path=config.train.pseudo_label_path,
            debug=config.debug
        )
        for phase in ['train', 'valid']
    }

    # create segmentation model with pre trained encoder
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    # train setting: loss + per-module learning rates
    criterion = get_loss(config)
    params = [
        {'params': model.decoder.parameters(),
         'lr': config.optimizer.params.decoder_lr},
        {'params': model.encoder.parameters(),
         'lr': config.optimizer.params.encoder_lr},
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model, device=get_device())

    callbacks = [DiceCallback(), IouCallback()]
    if config.train.early_stop_patience > 0:
        callbacks.append(EarlyStoppingCallback(
            patience=config.train.early_stop_patience))

    # gradient accumulation
    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend(
            [CriterionCallback(),
             OptimizerCallback(accumulation_steps=accumulation_steps)]
        )

    # to resume from check points if exists.
    # BUG FIX: the existence check was on best.pth while the resume path
    # was last_full.pth; check the file that is actually resumed so the
    # CheckpointCallback never points at a missing file. (This also matches
    # the sibling run() implementation in this file.)
    if os.path.exists(config.work_dir + '/checkpoints/last_full.pth'):
        callbacks.append(CheckpointCallback(
            resume=config.work_dir + '/checkpoints/last_full.pth'))

    if config.train.mixup:
        callbacks.append(MixupCallback())
    if config.train.cutmix:
        callbacks.append(CutMixCallback())

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=True,
    )