Example #1
def test_register_record_function(tmpdir):
    use_cuda = torch.cuda.is_available()
    pytorch_profiler = PyTorchProfiler(
        export_to_chrome=False,
        use_cuda=use_cuda,
        dirpath=tmpdir,
        filename="profiler",
        schedule=None,
        on_trace_ready=None,
    )

    class TestModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Sequential(torch.nn.Linear(1, 1), torch.nn.ReLU(), torch.nn.Linear(1, 1))

    model = TestModel()
    input = torch.rand((1, 1))

    if use_cuda:
        model = model.cuda()
        input = input.cuda()

    with pytorch_profiler.profile("a"):
        with RegisterRecordFunction(model):
            model(input)

    pytorch_profiler.describe()
    event_names = [e.name for e in pytorch_profiler.function_events]
    assert "[pl][module]torch.nn.modules.container.Sequential: layer" in event_names
    assert "[pl][module]torch.nn.modules.linear.Linear: layer.0" in event_names
    assert "[pl][module]torch.nn.modules.activation.ReLU: layer.1" in event_names
    assert "[pl][module]torch.nn.modules.linear.Linear: layer.2" in event_names
Example #2
def test_pytorch_profiler_trainer_ddp(tmpdir, use_output_filename):
    """Ensure that the profiler can be given to the training and default step are properly recorded. """

    if use_output_filename:
        output_filename = os.path.join(tmpdir, "profiler.txt")
    else:
        output_filename = None

    profiler = PyTorchProfiler(output_filename=output_filename)

    model = BoringModel()
    trainer = Trainer(
        fast_dev_run=True,
        profiler=profiler,
        accelerator="ddp",
        gpus=2,
    )
    trainer.fit(model)

    enabled = use_output_filename or (not use_output_filename and profiler.local_rank == 0)

    if enabled:
        assert len(profiler.summary()) > 0
        assert set(profiler.profiled_actions.keys()) == {
            'training_step_and_backward', 'validation_step'
        }
    else:
        assert profiler.summary() is None
        assert set(profiler.profiled_actions.keys()) == set()

    # todo (tchaton) add support for all ranks
    if use_output_filename and os.getenv("LOCAL_RANK") == "0":
        data = Path(profiler.output_fname).read_text()
        assert len(data) > 0
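The `use_output_filename` argument above is a pytest parameter; the parametrize decorator is not part of the snippet. A minimal sketch of what it might look like (the decorator and its values are assumptions, not from the original source):

import pytest

# Hypothetical parametrization: run once without an output file, once with one.
@pytest.mark.parametrize("use_output_filename", [False, True])
def test_pytorch_profiler_trainer_ddp(tmpdir, use_output_filename):
    ...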
Example #3
def test_pytorch_profiler_trainer_ddp(tmpdir):
    """Ensure that the profiler can be given to the training and default step are properly recorded. """
    pytorch_profiler = PyTorchProfiler(dirpath=None, filename="profiler")
    model = BoringModel()
    trainer = Trainer(
        max_epochs=1,
        default_root_dir=tmpdir,
        limit_train_batches=2,
        limit_val_batches=2,
        profiler=pytorch_profiler,
        accelerator="ddp",
        gpus=2,
    )
    trainer.fit(model)

    assert len(pytorch_profiler.summary()) > 0
    assert set(pytorch_profiler.profiled_actions) == {
        'training_step_and_backward', 'validation_step'
    }

    files = sorted(f for f in os.listdir(pytorch_profiler.dirpath)
                   if "fit" in f)
    rank = int(os.getenv("LOCAL_RANK", "0"))
    expected = f"fit-profiler-{rank}.txt"
    assert files[rank] == expected

    path = os.path.join(pytorch_profiler.dirpath, expected)
    data = Path(path).read_text("utf-8")
    assert len(data) > 0
Example #4
def test_pytorch_profiler_nested_emit_nvtx(tmpdir):
    """This test check emit_nvtx is correctly supported."""
    profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True)

    model = BoringModel()
    trainer = Trainer(fast_dev_run=True, profiler=profiler, gpus=1)
    trainer.fit(model)
Example #5
def test_pytorch_profiler_trainer_test(tmpdir):
    """Ensure that the profiler can be given to the trainer and test step are properly recorded. """
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir,
                                       filename="profile",
                                       schedule=None)
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_test_batches=2,
        profiler=pytorch_profiler,
    )
    trainer.test(model)

    assert sum(e.name == 'test_step' for e in pytorch_profiler.function_events)

    path = pytorch_profiler.dirpath / f"test-{pytorch_profiler.filename}.txt"
    assert path.read_text("utf-8")

    if _KINETO_AVAILABLE:
        files = sorted(
            [file for file in os.listdir(tmpdir) if file.endswith('.json')])
        assert any(f'test-{pytorch_profiler.filename}' in f for f in files)
        path = pytorch_profiler.dirpath / f"test-{pytorch_profiler.filename}.txt"
        assert path.read_text("utf-8")
Example #6
def test_pytorch_profiler_trainer(fn, step_name, boring_model_cls, tmpdir):
    """Ensure that the profiler can be given to the trainer and test step are properly recorded."""
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir,
                                       filename="profile",
                                       schedule=None)
    model = boring_model_cls()
    model.predict_dataloader = model.train_dataloader
    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=1,
                      limit_test_batches=2,
                      profiler=pytorch_profiler)
    getattr(trainer, fn)(model)

    assert sum(e.name == f"{step_name}_step"
               for e in pytorch_profiler.function_events)

    path = pytorch_profiler.dirpath / f"{fn}-{pytorch_profiler.filename}.txt"
    assert path.read_text("utf-8")

    if _KINETO_AVAILABLE:
        files = sorted(file for file in os.listdir(tmpdir)
                       if file.endswith(".json"))
        assert any(f"{fn}-{pytorch_profiler.filename}" in f for f in files)
        path = pytorch_profiler.dirpath / f"{fn}-{pytorch_profiler.filename}.txt"
        assert path.read_text("utf-8")
Example #7
def test_pytorch_profiler_nested(tmpdir):
    """Ensure that the profiler handles nested context"""

    pytorch_profiler = PyTorchProfiler(profiled_functions=["a", "b", "c"],
                                       use_cuda=False,
                                       output_filename=os.path.join(
                                           tmpdir, "profiler.txt"))

    with pytorch_profiler.profile("a"):
        a = torch.ones(42)
        with pytorch_profiler.profile("b"):
            b = torch.zeros(42)
        with pytorch_profiler.profile("c"):
            _ = a + b

    pa = pytorch_profiler.profiled_actions

    # From PyTorch 1.8.0 onwards, fewer operations are traced.
    if LooseVersion(torch.__version__) >= LooseVersion("1.8.0"):
        expected_ = {
            'a': ['ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'add'],
            'b': ['zeros', 'empty', 'zero_'],
            'c': ['add'],
        }
    # From PyTorch 1.6.0 onwards, more operations are traced.
    elif LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
        expected_ = {
            'a': [
                'ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'fill_',
                'add', 'empty'
            ],
            'b': ['zeros', 'empty', 'zero_', 'fill_'],
            'c': ['add', 'empty'],
        }
    else:
        expected_ = {
            'a': ['add'],
            'b': [],
            'c': ['add'],
        }

    for n in ('a', 'b', 'c'):
        pa[n] = [e.name for e in pa[n]]
        if LooseVersion(torch.__version__) >= LooseVersion("1.7.1"):
            pa[n] = [e.replace("aten::", "") for e in pa[n]]
        assert pa[n] == expected_[n]
Example #8
def test_pytorch_profiler_nested(tmpdir):
    """Ensure that the profiler handles nested context"""

    pytorch_profiler = PyTorchProfiler(record_functions={"a", "b", "c"},
                                       use_cuda=False,
                                       dirpath=tmpdir,
                                       filename="profiler",
                                       schedule=None)

    with pytorch_profiler.profile("a"):
        a = torch.ones(42)
        with pytorch_profiler.profile("b"):
            b = torch.zeros(42)
        with pytorch_profiler.profile("c"):
            _ = a + b

    pytorch_profiler.describe()

    events_name = {e.name for e in pytorch_profiler.function_events}

    names = {"a", "b", "c"}
    ops = {"add", "empty", "fill_", "ones", "zero_", "zeros"}
    if _TORCH_GREATER_EQUAL_1_7:
        ops = {"aten::" + op for op in ops}

    expected = names.union(ops)
    assert events_name == expected, (events_name, torch.__version__,
                                     platform.system())
Example #9
def build_profiler(name):
    if name == 'inference':
        return InferenceProfiler()
    elif name == 'pytorch':
        from pytorch_lightning.profiler import PyTorchProfiler
        return PyTorchProfiler(use_cuda=True,
                               profile_memory=True,
                               row_limit=100)
    elif name is None:
        return PassThroughProfiler()
    else:
        raise ValueError(f'Invalid profiler: {name}')
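A minimal usage sketch for `build_profiler`; the Trainer call below is illustrative and not part of the original source:

from pytorch_lightning import Trainer

# Build the CUDA profiler variant and attach it to a Trainer run.
profiler = build_profiler('pytorch')
trainer = Trainer(profiler=profiler, max_epochs=1)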
Example #10
def build_profiler(name):
    if name == 'inference':
        return InferenceProfiler()
    elif name == 'pytorch':
        from pytorch_lightning.profiler import PyTorchProfiler
        # TODO: this profiler will be introduced after upgrading pl dependency to 1.3.0 @zehong
        return PyTorchProfiler(use_cuda=True,
                               profile_memory=True,
                               row_limit=100)
    elif name is None:
        return PassThroughProfiler()
    else:
        raise ValueError(f'Invalid profiler: {name}')
Example #11
def test_pytorch_profiler_deepcopy(tmpdir):
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir,
                                       filename="profiler",
                                       schedule=None)
    pytorch_profiler.start("on_train_start")
    torch.tensor(1)
    pytorch_profiler.describe()
    assert deepcopy(pytorch_profiler)
Example #12
def test_pytorch_profiler_trainer_validate(tmpdir):
    """Ensure that the profiler can be given to the trainer and validate function are properly recorded."""
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir,
                                       filename="profile",
                                       schedule=None)
    model = BoringModel()
    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=1,
                      limit_val_batches=2,
                      profiler=pytorch_profiler)
    trainer.validate(model)

    assert sum(e.name == "validation_step"
               for e in pytorch_profiler.function_events)

    path = pytorch_profiler.dirpath / f"validate-{pytorch_profiler.filename}.txt"
    assert path.read_text("utf-8")
Example #13
def test_pytorch_profiler_trainer_fit(fast_dev_run, boring_model_cls, tmpdir):
    """Ensure that the profiler can be given to the trainer and test step are properly recorded."""
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir, filename="profile")
    model = boring_model_cls()
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, fast_dev_run=fast_dev_run, profiler=pytorch_profiler)
    trainer.fit(model)

    assert sum(e.name == "validation_step" for e in pytorch_profiler.function_events)

    path = pytorch_profiler.dirpath / f"fit-{pytorch_profiler.filename}.txt"
    assert path.read_text("utf-8")

    if _KINETO_AVAILABLE:
        files = sorted(file for file in os.listdir(tmpdir) if file.endswith(".json"))
        assert any(f"fit-{pytorch_profiler.filename}" in f for f in files)
        path = pytorch_profiler.dirpath / f"fit-{pytorch_profiler.filename}.txt"
        assert path.read_text("utf-8")
Example #14
def test_pytorch_profiler_trainer_predict(tmpdir):
    """Ensure that the profiler can be given to the trainer and predict function are properly recorded. """
    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir,
                                       filename="profile",
                                       schedule=None)
    model = BoringModel()
    model.predict_dataloader = model.train_dataloader
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_predict_batches=2,
        profiler=pytorch_profiler,
    )
    trainer.predict(model)

    assert sum(e.name == 'predict_step'
               for e in pytorch_profiler.function_events)
    path = pytorch_profiler.dirpath / f"predict-{pytorch_profiler.filename}.txt"
    assert path.read_text("utf-8")
Example #15
def test_pytorch_profiler_nested(tmpdir):
    """Ensure that the profiler handles nested context"""

    pytorch_profiler = PyTorchProfiler(record_functions={"a", "b", "c"},
                                       use_cuda=False,
                                       dirpath=tmpdir,
                                       filename="profiler",
                                       schedule=None)

    with pytorch_profiler.profile("a"):
        a = torch.ones(42)
        with pytorch_profiler.profile("b"):
            b = torch.zeros(42)
        with pytorch_profiler.profile("c"):
            _ = a + b

    pytorch_profiler.describe()

    events_name = {e.name for e in pytorch_profiler.function_events}

    if platform.system() == "Windows":
        expected = {
            'a', 'add', 'b', 'c', 'profiler::_record_function_enter',
            'profiler::_record_function_exit'
        }
    else:
        expected = {
            'signed char', 'add', 'profiler::_record_function_exit', 'bool',
            'char', 'profiler::_record_function_enter'
        }

    if Version(torch.__version__) >= Version("1.6.0"):
        expected = {
            'add', 'zeros', 'ones', 'zero_', 'b', 'fill_', 'c', 'a', 'empty'
        }

    if Version(torch.__version__) >= Version("1.7.0"):
        expected = {
            'aten::zeros', 'aten::add', 'aten::zero_', 'c', 'b', 'a',
            'aten::fill_', 'aten::empty', 'aten::ones'
        }

    assert events_name == expected, (events_name, torch.__version__,
                                     platform.system())
Example #16
def test_profile_callbacks(tmpdir):
    """Checks if profiling callbacks works correctly, specifically when there are two of the same callback type."""

    pytorch_profiler = PyTorchProfiler(dirpath=tmpdir, filename="profiler")
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=tmpdir,
        fast_dev_run=1,
        profiler=pytorch_profiler,
        callbacks=[EarlyStopping("val_loss"), EarlyStopping("train_loss")],
    )
    trainer.fit(model)
    assert sum(
        e.name == "[pl][profile][Callback]EarlyStopping{'monitor': 'val_loss', 'mode': 'min'}.on_validation_start"
        for e in pytorch_profiler.function_events
    )
    assert sum(
        e.name == "[pl][profile][Callback]EarlyStopping{'monitor': 'train_loss', 'mode': 'min'}.on_validation_start"
        for e in pytorch_profiler.function_events
    )
Example #17
def process_args(args=None, return_io=False):
    """
    Process arguments for running training
    """
    if not isinstance(args, argparse.Namespace):
        args = parse_args(args)

    args.loader_kwargs = dict()

    targs = dict(max_epochs=args.epochs)

    targs['accumulate_grad_batches'] = args.accumulate

    env = None

    if args.ipu:
        targs['accelerator'] = 'ipu'
        targs['devices'] = process_gpus(args.gpus)
    else:
        targs['gpus'] = process_gpus(args.gpus)
        targs['num_nodes'] = args.num_nodes
        if args.lsf:
            ##########################################################################################
            # Currently coding against pytorch-lightning 1.4.3
            ##########################################################################################
            if args.num_workers > 4:
                print0(
                    "num_workers (-k) > 4 can lead to hanging on Summit -- setting to 4",
                    file=sys.stderr)
                args.num_workers = 4
            # Set as a default. This will get overridden elsewhere.
            args.loader_kwargs['num_workers'] = 1
            args.loader_kwargs['multiprocessing_context'] = 'spawn'
            env = LSFEnvironment()
        elif args.slurm:
            env = SLURMEnvironment()

        if env is not None:
            global RANK
            global SIZE
            try:
                RANK = env.global_rank()
                SIZE = env.world_size()
            except Exception:
                print(
                    ">>> Could not get global rank -- setting RANK to 0 and SIZE to 1",
                    file=sys.stderr)
                RANK = 0
                SIZE = 1

        if targs['gpus'] is not None:
            targs['accelerator'] = 'gpu'
            if targs['gpus'] == 1:
                targs['devices'] = 1
            else:
                if env is None:
                    raise ValueError(
                        'Please specify environment (--lsf or --slurm) if using more than one GPU'
                    )
                # parallel_devices = [torch.device(i) for i in range(torch.cuda.device_count()) if i < targs['gpus']]
                # precision_plugin = NativeMixedPrecisionPlugin(16, 'cuda')
                torch.cuda.set_device(env.local_rank())
                targs['devices'] = targs['gpus']
                targs['strategy'] = DDPStrategy(
                    find_unused_parameters=False,
                    cluster_environment=env,
                    #accelerator=GPUAccelerator(),
                    #parallel_devices=parallel_devices,
                    #precision_plugin=precision_plugin,
                )

                print(
                    "---- Rank %s  -  Using GPUAccelerator with DDPStrategy" %
                    env.global_rank(),
                    file=sys.stderr)
        else:
            targs['accelerator'] = 'cpu'

    del args.gpus

    if args.sanity:
        if isinstance(args.sanity, str):
            args.sanity = int(args.sanity)
        else:
            args.sanity = 4000
        targs['limit_train_batches'] = args.sanity
        targs['limit_val_batches'] = args.sanity // 4

    if args.lr_find:
        targs['auto_lr_find'] = True
    del args.lr_find

    if args.checkpoint is not None:
        if os.path.exists(args.checkpoint):
            targs['resume_from_checkpoint'] = args.checkpoint
        else:
            warnings.warn(
                f"Ignoring -c/--checkpoint argument because {args.checkpoint} does not exist."
            )
            args.checkpoint = None

    if args.cuda_profile:
        targs['profiler'] = PyTorchProfiler(
            filename=f'pytorch_prof.{RANK:0{len(str(SIZE))}}', emit_nvtx=True)

    targs['replace_sampler_ddp'] = False

    # make sure we are classifying if we are using adding classifier layers
    # to a resnet features model
    if args.features_checkpoint is not None:
        if args.manifold:
            raise ValueError(
                'Cannot use manifold loss (i.e. -M) if adding classifier (i.e. -F)'
            )
        args.classify = True

    data_mod = DeepIndexDataModule(args,
                                   keep_open=True,
                                   seed=args.seed + RANK,
                                   rank=RANK,
                                   size=SIZE)

    # if classification problem, use the number of taxa as the number of outputs
    if args.classify:
        args.n_outputs = data_mod.dataset.n_outputs

    args.input_nc = 136 if args.tnf else len(data_mod.dataset.vocab)

    model = process_model(args, taxa_table=data_mod.dataset.difile.taxa_table)

    if args.num_workers > 0:
        data_mod.dataset.close()

    ret = [model, args, targs]
    if return_io:
        ret.append(io)

    ret.append(data_mod)

    return tuple(ret)
Example #18
def test_pytorch_profiler_raises(pytorch_profiler):
    """Ensure errors are raised where expected."""
    with pytest.raises(
            MisconfigurationException,
            match="profiled_functions` and `PyTorchProfiler.record"):
        PyTorchProfiler(profiled_functions=["a"], record_functions=["b"])
Example #19
@pytest.fixture
def pytorch_profiler(tmpdir):
    return PyTorchProfiler(dirpath=tmpdir, filename="profiler")
Example #20
def test_v1_5_0_legacy_profiler_argument():
    with pytest.deprecated_call(match="renamed to `record_functions` in v1.3"):
        PyTorchProfiler(profiled_functions=[])
Example #21
def cli_main():
    pl.seed_everything(1234)

    # ------------
    # args
    # ------------
    parser = ArgumentParser()
    parser.add_argument('--base_folders', nargs='+', default=[], required=True)
    parser.add_argument('--datasets', nargs='+', default=[], required=True)
    parser.add_argument('--shuffle', action="store_true", default=False)
    parser.add_argument('--use_tpu', action="store_true", default=False)
    parser.add_argument('--memory_profile', action="store_true", default=False)
    parser.add_argument('--tags', nargs='*', default=[])
    parser = UTWRS.add_model_specific_args(parser)
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    # ------------
    # data path
    # ------------
    file_paths = []
    max_seq_length = 0
    max_summary_length = 0

    if "BBC" in args.datasets:
        i = args.datasets.index("BBC")
        file_paths.append(get_file_paths(args.base_folders[i]))
        max_seq_length = max(get_max_seq_len(args.base_folders[i]),
                             max_seq_length)
        max_summary_length = max(get_max_summary_len(args.base_folders[i]),
                                 max_summary_length)

    if "OVSD" in args.datasets:
        i = args.datasets.index("OVSD")
        file_paths.append(get_file_paths(args.base_folders[i]))
        max_seq_length = max(get_max_seq_len(args.base_folders[i]),
                             max_seq_length)
        max_summary_length = max(get_max_summary_len(args.base_folders[i]),
                                 max_summary_length)

    if file_paths == []:
        raise UnsupportedOperation("--datasets only supports BBC or OVSD.")

    # ------------
    # data args
    # ------------
    # Add <START> and <END> token
    args.enc_seq_len = max_seq_length + 2
    args.dec_seq_len = max_summary_length + 2

    # ------------
    # Split train/test
    # ------------
    print(f"\nTotal number of videos: {sum([len(i) for i in file_paths])}")
    print(f"Max length of videos: {max_seq_length}")
    print(f"Max length of summary: {max_summary_length}\n")

    train_paths = []
    test_paths = []

    for dataset in file_paths:
        np.random.shuffle(dataset)
        train_paths.extend(dataset[:-2])
        test_paths.extend(dataset[-2:])

    # ------------
    # K-fold
    # ------------

    kfold = StratifiedKFold(n_splits=3, shuffle=False)

    # Generate data index for kfold
    X = [0] * len(train_paths)
    Y = []
    for i, dataset in enumerate(file_paths):
        Y += [i] * (len(dataset) - 2)

    train_paths = np.array(train_paths)
    for k, (train, val) in enumerate(
            tqdm(kfold.split(X, Y), total=kfold.get_n_splits())):
        print(f"Training data: f{train_paths[train]}")
        print(f"Validation data: f{train_paths[val]}")
        # ------------
        # data loader
        # ------------
        data_loader = OVSDBBCDataModule(max_seq_length,
                                        max_summary_length,
                                        args.d_model,
                                        train_paths[train],
                                        train_paths[val],
                                        shuffle=args.shuffle,
                                        use_tpu=args.use_tpu)

        # ------------
        # model
        # ------------
        model = UTWRS(args, SRC_PAD_TOKEN, TRG_PAD_TOKEN)

        # ------------
        # neptune logger
        # ------------
        neptune_logger = NeptuneLogger(project_name="guyleaf/UTWRS",
                                       params=vars(args),
                                       experiment_name=f"{k+1}-fold_logger",
                                       tags=args.tags)
        neptune_logger.experiment.log_text("training_data",
                                           ','.join(train_paths[train]))
        neptune_logger.experiment.log_text("validation_data",
                                           ','.join(train_paths[val]))

        # ------------
        # checkpoint
        # ------------
        model_checkpoint = ModelCheckpoint(
            dirpath="checkpoints",
            filename='{epoch:02d}_{test_loss:.2f}',
            save_top_k=3,
            monitor='test_loss',
            mode='min')

        # ------------
        # profiler
        # ------------
        profiler = PyTorchProfiler(
            output_filename=f"profiles/{k}-fold_profiler",
            profile_memory=True,
            sort_by_key="cuda_memory_usage",
            row_limit=50,
            enabled=args.memory_profile)

        # ------------
        # training
        # ------------
        trainer = pl.Trainer.from_argparse_args(
            args,
            logger=neptune_logger,
            profiler=profiler,
            checkpoint_callback=model_checkpoint,
            track_grad_norm=2,
            log_every_n_steps=100)
        trainer.fit(model, data_loader)

        # Log model checkpoints to Neptune (ckpt_path avoids shadowing the fold index k)
        for ckpt_path in model_checkpoint.best_k_models.keys():
            model_name = 'checkpoints/' + ckpt_path.split('/')[-1]
            neptune_logger.experiment.log_artifact(ckpt_path, model_name)

        # Log score of the best model checkpoint.
        neptune_logger.experiment.set_property(
            'best_model_loss', model_checkpoint.best_model_score.tolist())
        if args.memory_profile:
            neptune_logger.experiment.log_artifact('profiles')
Example #22
@pytest.fixture
def pytorch_profiler(tmpdir):
    profiler = PyTorchProfiler(output_filename=os.path.join(tmpdir, "profiler.txt"),
                               local_rank=0)
    return profiler
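Any test that declares `pytorch_profiler` as an argument (as in Example #18) receives this fixture's return value automatically. An illustrative, hypothetical consumer:

# Hypothetical test: record one action and write the profiler summary.
def test_profiler_records_action(pytorch_profiler):
    with pytorch_profiler.profile("my_action"):
        torch.tensor(1)
    pytorch_profiler.describe()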