Example #1
def run(logdir_suffix: str = '',
        device: str = None,
        check: bool = False) -> dict:

    device = device or utils.get_device()
    print(f"device: {device}")

    utils.set_global_seed(SEED)

    # convert parquet to zipped images
    parquet_to_images(TRAIN, ZIP_TRAIN_FILE, SIZE)
    parquet_to_images(TEST, ZIP_TEST_FILE, SIZE)

    # run experiment
    RunnerClass = SupervisedRunner if check else SupervisedWandbRunner
    runner = RunnerClass(
        device=device,
        input_key="images",
        output_key=(["features"] +
                    ["logit_" + c for c in output_classes.keys()]),
        input_target_key=list(output_classes.keys()),
    )
    experiment = Experiment(logdir='./logs' + logdir_suffix)
    runner.run_experiment(experiment, check=check)

    return {
        'runner': runner,
        'experiment': experiment,
    }
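
A minimal usage sketch (hypothetical call site; it assumes the module-level names used inside `run`, such as `utils`, `SEED`, `TRAIN`, and `output_classes`, are already defined). Passing `check=True` switches to the plain `SupervisedRunner` for a fast dry run without W&B logging:

results = run(logdir_suffix='_baseline', check=True)
print(results['runner'], results['experiment'])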
Example #2
def run(name: str = None,
        config: dict = None,
        device: str = None,
        check: bool = False) -> dict:
    config = config or experiment_config
    device = device or utils.get_device()
    print(f"device: {device}")

    utils.set_global_seed(SEED)

    # initialize Weights & Biases run name
    name = name or '_'.join(
        filter(None,
               [experiment_name, f"{datetime.datetime.now():%Y-%m-%d-%S}"]))

    # convert parquet to zipped images
    parquet_to_images(TRAIN, ZIP_TRAIN_FILE, SIZE)
    parquet_to_images(TEST, ZIP_TEST_FILE, SIZE)

    # run experiment
    runner = SupervisedRunner(
        device=device,
        input_key="images",
        output_key=["logit_" + c for c in output_classes.keys()],
        input_target_key=list(output_classes.keys()),
    )
    experiment = Experiment(config)
    runner.run_experiment(experiment, check=check)

    return {
        'runner': runner,
        'experiment': experiment,
        'config': config,
    }
Example #3
    def _run_epoch(self, loaders):
        # @TODO: better solution for train/inference handling?
        if not self.state.stage.startswith("infer"):
            assert self.state.valid_loader in loaders.keys(), \
                f"'{self.state.valid_loader}' " \
                f"should be in provided loaders: {list(loaders.keys())}"
        else:
            assert not any(x.startswith("train") for x in loaders.keys()), \
                "for inference no train loader should be passed"

        for loader_name, loader in loaders.items():
            self.state.loader_name = loader_name
            self.state.loader_len = len(loader)
            self.state.need_backward = loader_name.startswith("train")
            utils.maybe_recursive_call(self.model,
                                       "train",
                                       mode=self.state.need_backward)

            if isinstance(loader.sampler, DistributedSampler) \
                    and loader_name.startswith("train"):
                loader.sampler.set_epoch(self.state.stage_epoch)

            utils.set_global_seed(self.experiment.initial_seed +
                                  self.state.epoch + 1)
            self._run_event("loader_start")
            with torch.set_grad_enabled(self.state.need_backward):
                self._run_loader(loader)
            self._run_event("loader_end")
Example #4
def run(config: dict = None,
        logdir_suffix: str = '',
        device: str = None,
        check: bool = False) -> dict:

    config = config or experiment_config
    device = device or utils.get_device()
    print(f"device: {device}")

    utils.set_global_seed(SEED)

    config['monitoring_params']['name'] = EXPERIMENT_NAME
    config['stages']['state_params']['checkpoint_data']['image_size'] = SIZE
    config['args']['logdir'] += logdir_suffix

    # convert parquet to zipped images
    parquet_to_images(TRAIN, ZIP_TRAIN_FILE, SIZE)
    parquet_to_images(TEST, ZIP_TEST_FILE, SIZE)

    # run experiment
    RunnerClass = SupervisedRunner if check else SupervisedWandbRunner
    runner = RunnerClass(
        device=device,
        input_key="images",
        output_key=["logit_" + c for c in output_classes.keys()],
        input_target_key=list(output_classes.keys()),
    )
    experiment = Experiment(config)
    runner.run_experiment(experiment, check=check)

    return {
        'runner': runner,
        'experiment': experiment,
        'config': config,
    }
Example #5
    def _get_experiment_components(
        self,
        stage: str = None
    ) -> Tuple[_Model, _Criterion, _Optimizer, _Scheduler, torch.device]:
        """
        Inner method for children's classes for model specific initialization.
        As baseline, checks device support and puts model on it.
        :return:
        """

        utils.set_global_seed(self.experiment.initial_seed)
        model = self.experiment.get_model(stage)
        criterion, optimizer, scheduler = \
            self.experiment.get_experiment_components(model, stage)

        model, criterion, optimizer, scheduler, device = \
            utils.process_components(
                model=model,
                criterion=criterion,
                optimizer=optimizer,
                scheduler=scheduler,
                distributed_params=self.experiment.distributed_params
            )

        return model, criterion, optimizer, scheduler, device
Example #6
def run(name: str = None,
        config: dict = None,
        device: str = None,
        check: bool = False) -> dict:
    config = config or experiment_config
    device = device or utils.get_device()
    print(f"device: {device}")

    utils.set_global_seed(SEED)

    config['monitoring_params']['name'] = EXPERIMENT_NAME

    # convert parquet to zipped images
    parquet_to_images(TRAIN, ZIP_TRAIN_FILE, SIZE)
    parquet_to_images(TEST, ZIP_TEST_FILE, SIZE)

    # run experiment
    runner = SupervisedRunner(
        device=device,
        input_key="images",
        output_key=["logit_" + c for c in output_classes.keys()],
        input_target_key=list(output_classes.keys()),
    )
    experiment = Experiment(config)
    runner.run_experiment(experiment, check=check)

    return {
        'runner': runner,
        'experiment': experiment,
        'config': config,
    }
Example #7
    def _prepare_for_stage(self, stage: str):
        super()._prepare_for_stage(stage=stage)

        # @TODO: remove this trick
        utils.set_global_seed(self.experiment.initial_seed)
        loaders = self.experiment.get_loaders(stage=stage)
        self.loaders = loaders
Example #8
def run(max_lr: float = 1e-1,
        steps_per_epoch: int = 1413,
        device: str = None,
        check: bool = False) -> dict:
    config = copy.deepcopy(experiment_config)
    device = device or utils.get_device()
    print(f"device: {device}")

    utils.set_global_seed(SEED)

    # convert parquet to zipped images
    parquet_to_images(TRAIN, ZIP_TRAIN_FILE, SIZE)
    parquet_to_images(TEST, ZIP_TEST_FILE, SIZE)

    config['monitoring_params']['name'] = EXPERIMENT_NAME
    config['stages']['state_params']['checkpoint_data']['image_size'] = SIZE

    # add scheduler to config
    config["stages"]["scheduler_params"] = {
        "scheduler": "OneCycleLR",
        "max_lr": max_lr,
        "epochs": config["stages"]["state_params"]["num_epochs"],
        "steps_per_epoch": steps_per_epoch,
        "div_factor": 200,
        "final_div_factor": 1e5,
    }
    experiment = Experiment(config)

    # run experiment
    runner = SupervisedWandbRunner(
        device=device,
        input_key="images",
        output_key=["logit_" + c for c in output_classes.keys()],
        input_target_key=list(output_classes.keys()),
    )

    runner.run_experiment(experiment, check=check)

    return {
        'runner': runner,
        'experiment': experiment,
        'config': config,
    }
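
For reference, a hedged sketch of what the `scheduler_params` entry above presumably resolves to in plain PyTorch, assuming Catalyst maps the string "OneCycleLR" to `torch.optim.lr_scheduler.OneCycleLR`:

import torch

model = torch.nn.Linear(10, 2)  # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=1e-1,            # peak learning rate of the cycle
    epochs=10,              # num_epochs from the config
    steps_per_epoch=1413,
    div_factor=200,         # initial_lr = max_lr / div_factor
    final_div_factor=1e5,   # min_lr = initial_lr / final_div_factor
)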
Example #9
def main_worker(args, unknown_args):
    """Runs main worker thread from model training."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex
    config.setdefault("distributed_params", {})["amp"] = args.amp
    expdir = Path(args.expdir)

    # optuna objective
    def objective(trial: optuna.trial):
        trial, trial_config = _process_trial_config(trial, config.copy())
        experiment, runner, trial_config = utils.prepare_config_api_components(
            expdir=expdir, config=trial_config)
        # @TODO: here we need better solution.
        experiment._trial = trial  # noqa: WPS437

        if experiment.logdir is not None and utils.get_rank() <= 0:
            utils.dump_environment(trial_config, experiment.logdir,
                                   args.configs)
            utils.dump_code(args.expdir, experiment.logdir)

        runner.run_experiment(experiment)

        return runner.best_valid_metrics[runner.main_metric]

    # optuna direction
    direction = ("minimize" if config.get("stages", {}).get(
        "stage_params", {}).get("minimize_metric", True) else "maximize")

    # optuna sampler
    sampler_params = config.pop("optuna_sampler_params", {})
    optuna_sampler_type = sampler_params.pop("sampler", None)
    optuna_sampler = (optuna.samplers.__dict__[optuna_sampler_type](
        **sampler_params) if optuna_sampler_type is not None else None)

    # optuna pruner
    pruner_params = config.pop("optuna_pruner_params", {})
    optuna_pruner_type = pruner_params.pop("pruner", None)
    optuna_pruner = (optuna.pruners.__dict__[optuna_pruner_type](
        **pruner_params) if optuna_pruner_type is not None else None)

    study = optuna.create_study(
        direction=direction,
        storage=args.storage,
        study_name=args.study_name,
        sampler=optuna_sampler,
        pruner=optuna_pruner,
    )
    study.optimize(
        objective,
        n_trials=args.n_trials,
        timeout=args.timeout,
        n_jobs=args.n_jobs or 1,
        gc_after_trial=args.gc_after_trial,
        show_progress_bar=args.show_progress_bar,
    )
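
A hedged sketch of the config fragment this worker consumes (names and values are illustrative): the `sampler`/`pruner` entries must match class names in `optuna.samplers` and `optuna.pruners`, and the remaining keys are forwarded as constructor kwargs:

config = {
    "optuna_sampler_params": {
        "sampler": "TPESampler",   # looked up in optuna.samplers.__dict__
        "n_startup_trials": 10,    # forwarded as TPESampler(**kwargs)
    },
    "optuna_pruner_params": {
        "pruner": "MedianPruner",  # looked up in optuna.pruners.__dict__
        "n_warmup_steps": 5,
    },
}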
Example #10
    def predict_loader(
        self,
        *,
        loader: DataLoader,
        model: Model = None,
        resume: str = None,
        fp16: Union[Dict, bool] = None,
        initial_seed: int = 42,
    ) -> Generator:
        """
        Runs model inference on a PyTorch DataLoader and returns a
        Python generator with model predictions from `runner.predict_batch`.
        Cleans up the experiment info to avoid possible collisions.
        Sets `is_train_loader` and `is_valid_loader` to `False` while
        keeping `is_infer_loader` as `True`. Moves the model to evaluation mode.

        Args:
            loader: loader to predict
            model: model to use for prediction
            resume: path to checkpoint to resume
            fp16 (Union[Dict, bool]): fp16 usage flag
            initial_seed: seed to use before prediction

        Yields:
            batches with model predictions
        """
        if isinstance(fp16, bool) and fp16:
            fp16 = {"opt_level": "O1"}

        if model is not None:
            self.model = model
        assert self.model is not None

        if resume is not None:
            checkpoint = utils.load_checkpoint(resume)
            utils.unpack_checkpoint(checkpoint, model=self.model)

        self.experiment = None
        utils.set_global_seed(initial_seed)
        (model, _, _, _, device) = utils.process_components(  # noqa: WPS122
            model=self.model,
            distributed_params=fp16,
            device=self.device,
        )
        self._prepare_inner_state(
            stage="infer",
            model=model,
            device=device,
            is_train_loader=False,
            is_valid_loader=False,
            is_infer_loader=True,
        )
        utils.maybe_recursive_call(self.model, "train", mode=False)

        utils.set_global_seed(initial_seed)
        for batch in loader:
            yield self.predict_batch(batch)
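
A hedged usage sketch (the loader and checkpoint path are placeholders): because `predict_loader` is a generator, predictions are computed lazily, one batch at a time:

predictions = []
for output in runner.predict_loader(loader=infer_loader, resume="best.pth"):
    predictions.append(output)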
Example #11
    def predict_loader(
        self,
        *,
        loader: DataLoader,
        model: Model = None,
        resume: str = None,
        fp16: Union[Dict, bool] = None,
        initial_seed: int = 42,
    ) -> Generator:
        """
        Runs model inference on a PyTorch DataLoader and returns a
        Python generator with model predictions from `runner.predict_batch`.

        Args:
            loader (DataLoader): loader to predict
            model (Model): model to use for prediction
            resume (str): path to checkpoint to resume
            fp16 (Union[Dict, bool]): fp16 usage flag
            initial_seed (int): seed to use before prediction

        Yields:
            batches with model predictions
        """
        if isinstance(fp16, bool) and fp16:
            fp16 = {"opt_level": "O1"}

        if model is not None:
            self.model = model
        assert self.model is not None

        if resume is not None:
            checkpoint = utils.load_checkpoint(resume)
            utils.unpack_checkpoint(checkpoint, model=self.model)

        (  # noqa: WPS122
            self.model,
            _,
            _,
            _,
            self.device,
        ) = utils.process_components(
            model=self.model,
            distributed_params=fp16,
            device=self.device,
        )

        utils.set_global_seed(initial_seed)
        for batch in loader:
            yield self.predict_batch(batch)
Example #12
def run(config: dict = None,
        model_filepath: str = None,
        logdir_suffix: str = '_' + EXPERIMENT_NAME,
        max_lr: float = 1e-1,
        steps_per_epoch: int = 1413,
        device: str = None,
        check: bool = False) -> dict:

    config = config or experiment_config
    device = device or utils.get_device()
    print(f"device: {device}")

    utils.set_global_seed(SEED)

    config['monitoring_params']['name'] = EXPERIMENT_NAME
    config['stages']['state_params']['checkpoint_data']['image_size'] = SIZE
    config['args']['logdir'] += logdir_suffix

    # convert parquet to zipped images
    parquet_to_images(TRAIN, ZIP_TRAIN_FILE, SIZE)
    parquet_to_images(TEST, ZIP_TEST_FILE, SIZE)

    # add scheduler to config
    config["stages"]["scheduler_params"] = {
        "scheduler": "OneCycleLR",
        "max_lr": max_lr,
        "epochs": config["stages"]["state_params"]["num_epochs"],
        "steps_per_epoch": steps_per_epoch,
        "div_factor": 500,
        "final_div_factor": 1e5,
        "max_momentum": 0.999
    }

    # run experiment
    RunnerClass = SupervisedRunner if check else SupervisedWandbRunner
    runner = RunnerClass(
        device=device,
        input_key="images",
        output_key=["logit_" + c for c in output_classes.keys()],
        input_target_key=list(output_classes.keys()),
    )
    experiment = Experiment(config, model_filepath)
    runner.run_experiment(experiment, check=check)

    return {
        'runner': runner,
        'experiment': experiment,
        'config': config,
    }
Example #13
def main(args, unknown_args):
    """Run the ``catalyst-dl run`` script"""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    Experiment, Runner = utils.import_experiment_and_runner(Path(args.expdir))

    runner_params = config.pop("runner_params", {}) or {}
    experiment = Experiment(config)
    runner = Runner(**runner_params)

    if experiment.logdir is not None:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    check_run = safitty.get(config, "args", "check", default=False)
    runner.run_experiment(experiment, check=check_run)
Example #14
def main_worker(args, unknown_args):
    """Runs main worker thread from model training."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex
    config.setdefault("distributed_params", {})["amp"] = args.amp

    experiment, runner, config = utils.prepare_config_api_components(
        expdir=Path(args.expdir), config=config
    )

    if experiment.logdir is not None and utils.get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
Example #15
def main_worker(args, unknown_args):
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex

    Experiment, Runner = utils.import_experiment_and_runner(Path(args.expdir))

    runner_params = config.get("runner_params", {})
    experiment = Experiment(config)
    runner = Runner(**runner_params)

    if experiment.logdir is not None and get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
Example #16
def main(args, _=None):
    """Run the ``catalyst-data image2embeddings`` script."""
    global IMG_SIZE

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    IMG_SIZE = (args.img_size, args.img_size)  # noqa: WPS442

    if args.traced_model is not None:
        device = utils.get_device()
        model = torch.jit.load(str(args.traced_model), map_location=device)
    else:
        model = ResnetEncoder(arch=args.arch, pooling=args.pooling)
        model = model.eval()
        model, _, _, _, device = utils.process_components(model=model)

    df = pd.read_csv(args.in_csv)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())

    open_fn = ImageReader(input_key=args.img_col,
                          output_key="image",
                          rootpath=args.rootpath)

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dict_transform=dict_transformer,
    )

    features = []
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for batch in dataloader:
            batch_features = model(batch["image"].to(device))
            batch_features = batch_features.cpu().detach().numpy()
            features.append(batch_features)

    features = np.concatenate(features, axis=0)
    np.save(args.out_npy, features)
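
A minimal sketch for reading the result back, assuming the script above was run with `args.out_npy` set to `embeddings.npy`:

import numpy as np

embeddings = np.load("embeddings.npy")  # shape: (num_images, embedding_dim)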
Example #17
    def _prepare_for_stage(self, stage: str):
        utils.set_global_seed(self.experiment.initial_seed)
        migrating_params = {}
        if self.state is not None:
            migrating_params.update({
                "step": self.state.step,
                "epoch": self.state.epoch + 1
            })

        self.model, criterion, optimizer, scheduler, self.device = \
            self._get_experiment_components(stage)

        self.state = RunnerState(stage=stage,
                                 model=self.model,
                                 device=self.device,
                                 criterion=criterion,
                                 optimizer=optimizer,
                                 scheduler=scheduler,
                                 **self.experiment.get_state_params(stage),
                                 **migrating_params)
        utils.set_global_seed(self.experiment.initial_seed)
Example #18
def main_worker(args, unknown_args):
    """@TODO: Docs. Contribution is welcome."""
    args, config = utils.parse_args_uargs(args, unknown_args)
    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    config.setdefault("distributed_params", {})["apex"] = args.apex

    experiment_fn, runner_fn = utils.import_experiment_and_runner(
        Path(args.expdir))
    if experiment_fn is None:
        experiment_params = config.get("experiment_params", {})
        experiment = experiment_params.get("experiment", "Experiment")
        experiment_fn = EXPERIMENTS.get(experiment)

    runner_params = config.get("runner_params", {})
    experiment = experiment_fn(config)
    runner = runner_fn(**runner_params)

    if experiment.logdir is not None and get_rank() <= 0:
        utils.dump_environment(config, experiment.logdir, args.configs)
        utils.dump_code(args.expdir, experiment.logdir)

    runner.run_experiment(experiment)
Example #19
    parser.add_argument('--sample', default=0, type=int)
    params = parser.parse_args()

    import torch
    from torch.utils.data import DataLoader
    from torchvision import transforms
    from catalyst.dl import SupervisedRunner
    from catalyst.dl.utils import set_global_seed, prepare_cudnn
    from catalyst.dl.callbacks import AccuracyCallback, AUCCallback, PrecisionRecallF1ScoreCallback

    from .dataset import BIOMETRY
    from .model import *
    from .transform import Normalize, ToTensor

    # Seed & CUDA deterministic
    set_global_seed(params.seed)
    prepare_cudnn(deterministic=params.deterministic)

    # Init custom transforms
    transform = transforms.Compose([
        Normalize(params.sample == 0),
        ToTensor(),
    ])

    # Init custom dataset
    data_dir = DIR_DATA_PROCESSED.joinpath('BIOMETRY')
    traindir = data_dir.joinpath('train').as_posix()
    validdir = data_dir.joinpath('valid').as_posix()
    train_dataset = BIOMETRY(traindir, transform=transform)
    valid_dataset = BIOMETRY(validdir, transform=transform)
Example #20
def post_transforms():
    # we use ImageNet image normalization
    # and convert it to torch.Tensor
    return [A.Normalize(p=1.0), ToTensorV2(p=1.0)]


if __name__ == "__main__":
    warnings.simplefilter("ignore", UserWarning)
    warnings.simplefilter("ignore", DeprecationWarning)
    warnings.filterwarnings('ignore')
    os.environ["PYTHONWARNINGS"] = "ignore"
    config = ConfigExperiment()
    config.size = EfficientNet.get_image_size(config.model_name)
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    utils.set_global_seed(config.seed)
    utils.prepare_cudnn(deterministic=True)


    train_transforms = plant.compose([
        pre_transforms(config.size),
        hard_transforms(),
        post_transforms()
    ])
    valid_transforms = plant.compose([
        pre_transforms(config.size),
        post_transforms()
    ])

    show_transforms = plant.compose([
        pre_transforms(config.size),
    ])
Example #21

    def _worker_init_fn(self, x):
        # cannot be a lambda if we want to run num_workers > 0 on Windows
        set_global_seed(self.initial_seed + x)
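
A hedged sketch of how this method is typically wired into a DataLoader (`dataset` and `experiment` are placeholders); a bound method pickles cleanly, which is why `num_workers > 0` works on Windows where a lambda would not:

from torch.utils.data import DataLoader

loader = DataLoader(
    dataset,                                    # placeholder Dataset instance
    batch_size=32,
    num_workers=4,
    worker_init_fn=experiment._worker_init_fn,  # seeds each worker as initial_seed + worker_id
)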
Example #22
def main():
    args = get_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
    SEED = 42
    utils.set_global_seed(SEED)
    utils.prepare_cudnn(deterministic=True)
    num_classes = 14

    # define datasets
    train_dataset = ChestXrayDataSet(
        data_dir=args.path_to_images,
        image_list_file=args.train_list,
        transform=transforms_train,
    )

    val_dataset = ChestXrayDataSet(
        data_dir=args.path_to_images,
        image_list_file=args.val_list,
        transform=transforms_val,
    )

    loaders = {
        'train':
        DataLoader(train_dataset,
                   batch_size=args.batch_size,
                   shuffle=True,
                   num_workers=args.num_workers),
        'valid':
        DataLoader(val_dataset,
                   batch_size=2,
                   shuffle=False,
                   num_workers=args.num_workers)
    }

    logdir = args.log_dir  # where model weights and logs are stored

    # define model
    model = DenseNet121(num_classes)
    if len(args.gpus) > 1:
        model = nn.DataParallel(model)
    device = utils.get_device()
    runner = SupervisedRunner(device=device)

    optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=0.0003)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=0.25,
                                                     patience=2)

    weights = torch.Tensor(
        [10, 100, 30, 8, 40, 40, 330, 140, 35, 155, 110, 250, 155,
         200]).to(device)
    criterion = BCEWithLogitsLoss(pos_weight=weights)

    class_names = [
        'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass',
        'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation', 'Edema',
        'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia'
    ]

    runner.train(
        model=model,
        logdir=logdir,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        num_epochs=args.epochs,

        # We can specify the callbacks list for the experiment;
        # For this task, we will check AUC and accuracy
        callbacks=[
            AUCCallback(
                input_key="targets",
                output_key='logits',
                prefix='auc',
                class_names=class_names,
                num_classes=num_classes,
                activation='Sigmoid',
            ),
            AccuracyCallback(
                input_key="targets",
                output_key="logits",
                prefix="accuracy",
                accuracy_args=[1],
                num_classes=num_classes,
                threshold=0.5,
                activation='Sigmoid',
            ),
        ],
        main_metric='auc/_mean',
        minimize_metric=False,
        verbose=True,
    )
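
The hand-tuned `weights` above counteract per-class imbalance; a hedged sketch of how such `pos_weight` values are commonly derived (the label matrix here is a random placeholder):

import torch

labels = torch.randint(0, 2, (1000, 14)).float()  # placeholder multi-label targets
positives = labels.sum(dim=0)
pos_weight = (labels.shape[0] - positives) / positives.clamp(min=1)  # negatives / positives
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)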
Example #23
    def _prepare_for_stage(self, stage: str):
        super()._prepare_for_stage(stage=stage)

        utils.set_global_seed(self.experiment.initial_seed)
        loaders = self.experiment.get_loaders(stage=stage)
        self.state.loaders = loaders
Example #24
    def get_loaders(self, stage: str) -> "OrderedDict[str, DataLoader]":
        """Returns the loaders for a given stage"""
        data_params = dict(self.stages_config[stage]["data_params"])

        batch_size = data_params.pop("batch_size", 1)
        num_workers = data_params.pop("num_workers")
        drop_last = data_params.pop("drop_last", False)
        per_gpu_scaling = data_params.pop("per_gpu_scaling", False)
        distributed_rank = self.distributed_params.get("rank", -1)
        distributed = distributed_rank > -1

        # pop loader-specific overrides before data_params is forwarded
        # to get_datasets, so they are not passed as unexpected kwargs
        overridden_loaders_params = data_params.pop("loaders_params", {})
        assert isinstance(overridden_loaders_params, dict), \
            f"{overridden_loaders_params} should be Dict"

        datasets = self.get_datasets(stage=stage, **data_params)

        loaders = OrderedDict()
        for name, ds_ in datasets.items():
            assert isinstance(ds_, (Dataset, dict)), \
                f"{ds_} should be Dataset or Dict"

            overridden_loader_params = overridden_loaders_params.pop(name, {})
            assert isinstance(overridden_loader_params, dict), \
                f"{overridden_loader_params} should be Dict"

            batch_size = overridden_loader_params.pop("batch_size", batch_size)
            num_workers = overridden_loader_params.\
                pop("num_workers", num_workers)

            if per_gpu_scaling and not distributed:
                num_gpus = max(1, torch.cuda.device_count())
                batch_size *= num_gpus
                num_workers *= num_gpus

            loader_params = {
                "batch_size": batch_size,
                "num_workers": num_workers,
                "pin_memory": torch.cuda.is_available(),
                "drop_last": drop_last,
                **overridden_loader_params
            }

            if isinstance(ds_, Dataset):
                loader_params["dataset"] = ds_
            elif isinstance(ds_, dict):
                assert "dataset" in ds_, \
                    "You need to specify dataset for dataloader"
                loader_params = utils.merge_dicts(ds_, loader_params)
            else:
                raise NotImplementedError

            if distributed:
                sampler = loader_params.get("sampler")
                if sampler is not None:
                    assert isinstance(sampler, DistributedSampler)
                else:
                    loader_params["sampler"] = DistributedSampler(
                        dataset=loader_params["dataset"])

            loader_params["shuffle"] = (name.startswith("train") and
                                        loader_params.get("sampler") is None)

            if "batch_sampler" in loader_params:
                if distributed:
                    raise ValueError("batch_sampler option is mutually "
                                     "exclusive with distributed")

                for k in ("batch_size", "shuffle", "sampler", "drop_last"):
                    loader_params.pop(k, None)

            if "worker_init_fn" not in loader_params:
                loader_params["worker_init_fn"] = \
                    lambda x: utils.set_global_seed(self.initial_seed + x)

            loaders[name] = DataLoader(**loader_params)

        return loaders
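
A hedged sketch of a `data_params` section that this method would parse (keys mirror the pops above; the values are illustrative):

stages_config = {
    "train": {
        "data_params": {
            "batch_size": 64,
            "num_workers": 4,          # required: popped without a default
            "drop_last": True,
            "per_gpu_scaling": True,   # scales batch_size/num_workers by GPU count
            "loaders_params": {
                "valid": {"batch_size": 128},  # per-loader override
            },
        },
    },
}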
Example #25
from catalyst.dl import utils
SEED = 42
utils.set_global_seed(SEED)
utils.prepare_cudnn(deterministic=True)

import its_training_utils as tu
import numpy as np
import pandas as pd
from datetime import datetime
import torch
from torch import nn
import os
import json
from sklearn.model_selection import train_test_split
import cv2
from collections import OrderedDict
from catalyst import dl
from catalyst.core import Callback, CallbackOrder
from catalyst.dl.callbacks import (
    AccuracyCallback, CheckpointCallback, AUCCallback, CriterionCallback,
    MetricAggregationCallback, MeterMetricsCallback, VerboseLogger,
    SchedulerCallback, OptimizerCallback, MixupCallback,
)
from catalyst.dl.callbacks.metrics.iou import IouCallback
from catalyst.utils.checkpoint import load_checkpoint, unpack_checkpoint
from albumentations.pytorch.transforms import ToTensor
import base64
from tqdm import tqdm
import torchvision.models as models
from catalyst.contrib.nn.optimizers.radam import RAdam
from catalyst.contrib.nn.optimizers.lookahead import Lookahead
from sklearn.model_selection import KFold, StratifiedKFold
from torch.utils.data import Dataset, DataLoader
import albumentations as albu
from efficientnet_pytorch import EfficientNet
Example #26
def main(args, _=None):
    """Run the ``catalyst-data text2embeddings`` script."""
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    if hasattr(args, "in_huggingface"):
        model_config = BertConfig.from_pretrained(args.in_huggingface)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel.from_pretrained(args.in_huggingface,
                                          config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_huggingface)
    else:
        model_config = BertConfig.from_pretrained(args.in_config)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel(config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_vocab)
    if hasattr(args, "in_model"):
        checkpoint = utils.load_checkpoint(args.in_model)
        checkpoint = {"model_state_dict": checkpoint}
        utils.unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=partial(
            tokenize_text,
            strip=args.strip,
            lowercase=args.lowercase,
            remove_punctuation=args.remove_punctuation,
        ),
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    features = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            bert_output = model(**batch)
            mask = (batch["attention_mask"].unsqueeze(-1)
                    if args.mask_for_max_length else None)

            if utils.check_ddp_wrapped(model):
                # using several gpu
                hidden_size = model.module.config.hidden_size
                hidden_states = model.module.config.output_hidden_states

            else:
                # using cpu or one gpu
                hidden_size = model.config.hidden_size
                hidden_states = model.config.output_hidden_states

            features_ = process_bert_output(
                bert_output=bert_output,
                hidden_size=hidden_size,
                output_hidden_states=hidden_states,
                pooling_groups=pooling_groups,
                mask=mask,
            )

            # create storage based on network output
            if idx == 0:
                for key, value in features_.items():
                    name_ = key if isinstance(key, str) else f"{key:02d}"
                    _, embedding_size = value.shape
                    features[name_] = np.memmap(
                        f"{args.out_prefix}.{name_}.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            indices = np.arange(idx * batch_size,
                                min((idx + 1) * batch_size, num_samples))
            for key, value in features_.items():
                name_ = key if isinstance(key, str) else f"{key:02d}"
                features[name_][indices] = _detach(value)
Example #27
def main(args, _=None):
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    model_config = BertConfig.from_pretrained(args.in_config)
    model_config.output_hidden_states = args.output_hidden_states
    model = BertModel(config=model_config)

    checkpoint = utils.load_checkpoint(args.in_model)
    checkpoint = {"model_state_dict": checkpoint}
    utils.unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    tokenizer = BertTokenizer.from_pretrained(args.in_vocab)

    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=get_features,
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    features = {}
    poolings = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            features_ = model(**batch)

            # create storage based on network output
            if idx == 0:
                # class
                _, embedding_size = features_[1].shape
                features["class"] = np.memmap(
                    f"{args.out_prefix}.class.npy",
                    dtype=np.float32,
                    mode="w+",
                    shape=(num_samples, embedding_size),
                )
                if args.output_hidden_states:
                    # all embeddings
                    for i, feature_ in enumerate(features_[2]):
                        name_ = f"embeddings_{i + 1:02d}"
                        _, _, embedding_size = feature_.shape
                        poolings[name_] = LamaPooling(
                            features_in=embedding_size,
                            groups=pooling_groups,
                        )
                        features[name_] = np.memmap(
                            f"{args.out_prefix}.{name_}.npy",
                            dtype=np.float32,
                            mode="w+",
                            shape=(num_samples, embedding_size),
                        )
                else:
                    # last
                    _, _, embedding_size = features_[0].shape
                    poolings["last"] = LamaPooling(
                        features_in=embedding_size,
                        groups=pooling_groups,
                    )
                    features["last"] = np.memmap(
                        f"{args.out_prefix}.last.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            indices = np.arange(idx * batch_size,
                                min((idx + 1) * batch_size, num_samples))
            features["class"][indices] = _detach(features_[1])
            if args.output_hidden_states:
                # all embeddings
                for i, feature_ in enumerate(features_[2]):
                    name_ = f"embeddings_{i + 1:02d}"
                    feature_ = poolings[name_](feature_)
                    features[name_][indices] = _detach(feature_)
            else:
                feature_ = poolings["last"](features_[0])
                features["last"][indices] = _detach(feature_)