Пример #1
0
class CometWriter:
    def __init__(self,
                 logger,
                 project_name: Optional[str] = None,
                 experiment_name: Optional[str] = None,
                 api_key: Optional[str] = None,
                 log_dir: Optional[str] = None,
                 offline: bool = False,
                 **kwargs):
        if not _COMET_AVAILABLE:
            raise ImportError(
                "You want to use `comet_ml` logger which is not installed yet,"
                " install it with `pip install comet-ml`.")

        self.project_name = project_name
        self.experiment_name = experiment_name
        self.kwargs = kwargs

        self.timer = Timer()

        if (api_key is not None) and (log_dir is not None):
            self.mode = "offline" if offline else "online"
            self.api_key = api_key
            self.log_dir = log_dir

        elif api_key is not None:
            self.mode = "online"
            self.api_key = api_key
            self.log_dir = None
        elif log_dir is not None:
            self.mode = "offline"
            self.log_dir = log_dir
        else:
            logger.warning(
                "CometLogger requires either api_key or save_dir during initialization."
            )

        if self.mode == "online":
            self.experiment = CometExperiment(
                api_key=self.api_key,
                project_name=self.project_name,
                **self.kwargs,
            )
        else:
            self.experiment = CometOfflineExperiment(
                offline_directory=self.log_dir,
                project_name=self.project_name,
                **self.kwargs,
            )

        if self.experiment_name:
            self.experiment.set_name(self.experiment_name)

    def set_step(self, step, epoch=None, mode='train') -> None:
        self.mode = mode
        self.step = step
        self.epoch = epoch
        if step == 0:
            self.timer.reset()
        else:
            duration = self.timer.check()
            self.add_scalar({'steps_per_sec': 1 / duration})

    def log_hyperparams(self, params: Dict[str, Any]) -> None:
        self.experiment.log_parameters(params)

    def log_code(self, file_name=None, folder='models/') -> None:
        self.experiment.log_code(file_name=file_name, folder=folder)

    def add_scalar(self,
                   metrics: Dict[str, Union[torch.Tensor, float]],
                   step: Optional[int] = None,
                   epoch: Optional[int] = None) -> None:
        metrics_renamed = {}
        for key, val in metrics.items():
            tag = '{}/{}'.format(key, self.mode)
            if is_tensor(val):
                metrics_renamed[tag] = val.cpu().detach()
            else:
                metrics_renamed[tag] = val
        if epoch is None:
            self.experiment.log_metrics(metrics_renamed,
                                        step=self.step,
                                        epoch=self.epoch)
        else:
            self.experiment.log_metrics(metrics_renamed, epoch=epoch)

    def add_plot(self, figure_name, figure):
        """
        Primarily for log gate plots
        """
        self.experiment.log_figure(figure_name=figure_name, figure=figure)

    def add_hist3d(self, hist, name):
        """
        Primarily for log gate plots
        """
        self.experiment.log_histogram_3d(hist, name=name)

    def reset_experiment(self):
        self.experiment = None

    def finalize(self) -> None:
        self.experiment.end()
        self.reset_experiment()
Пример #2
0
def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq,
          customize, comet_offline):

    # tf.debugging.enable_check_numerics()
    """Train a model defined by config"""
    config_file_path = config
    config, config_file_stem = parse_config(config,
                                            nepochs=nepochs,
                                            weights=weights)

    if plot_freq:
        config["callbacks"]["plot_freq"] = plot_freq

    if customize:
        config = customization_functions[customize](config)

    # Decide tf.distribute.strategy depending on number of available GPUs
    horovod_enabled = config["setup"]["horovod_enabled"]
    if horovod_enabled:
        num_gpus = initialize_horovod()
    else:
        strategy, num_gpus = get_strategy()

    outdir = ""
    if not horovod_enabled or hvd.rank() == 0:
        outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_",
                                       suffix=platform.node())
        shutil.copy(
            config_file_path, outdir + "/config.yaml"
        )  # Copy the config file to the train dir for later reference

    try:
        if comet_offline:
            print("Using comet-ml OfflineExperiment, saving logs locally.")
            from comet_ml import OfflineExperiment

            experiment = OfflineExperiment(
                project_name="particleflow-tf",
                auto_metric_logging=True,
                auto_param_logging=True,
                auto_histogram_weight_logging=True,
                auto_histogram_gradient_logging=False,
                auto_histogram_activation_logging=False,
                offline_directory=outdir + "/cometml",
            )
        else:
            print("Using comet-ml Experiment, streaming logs to www.comet.ml.")
            from comet_ml import Experiment

            experiment = Experiment(
                project_name="particleflow-tf",
                auto_metric_logging=True,
                auto_param_logging=True,
                auto_histogram_weight_logging=True,
                auto_histogram_gradient_logging=False,
                auto_histogram_activation_logging=False,
            )
    except Exception as e:
        print("Failed to initialize comet-ml dashboard: {}".format(e))
        experiment = None
    if experiment:
        experiment.set_name(outdir)
        experiment.log_code("mlpf/tfmodel/model.py")
        experiment.log_code("mlpf/tfmodel/utils.py")
        experiment.log_code(config_file_path)

    ds_train, num_train_steps = get_datasets(config["train_test_datasets"],
                                             config, num_gpus, "train")
    ds_test, num_test_steps = get_datasets(config["train_test_datasets"],
                                           config, num_gpus, "test")
    ds_val, ds_info = get_heptfds_dataset(
        config["validation_datasets"][0],
        config,
        num_gpus,
        "test",
        config["setup"]["num_events_validation"],
        supervised=False,
    )
    ds_val = ds_val.batch(5)

    if ntrain:
        ds_train = ds_train.take(ntrain)
        num_train_steps = ntrain
    if ntest:
        ds_test = ds_test.take(ntest)
        num_test_steps = ntest

    print("num_train_steps", num_train_steps)
    print("num_test_steps", num_test_steps)
    total_steps = num_train_steps * config["setup"]["num_epochs"]
    print("total_steps", total_steps)

    if horovod_enabled:
        model, optim_callbacks, initial_epoch = model_scope(
            config, total_steps, weights, horovod_enabled)
    else:
        with strategy.scope():
            model, optim_callbacks, initial_epoch = model_scope(
                config, total_steps, weights)

    callbacks = prepare_callbacks(
        config,
        outdir,
        ds_val,
        comet_experiment=experiment,
        horovod_enabled=config["setup"]["horovod_enabled"])

    verbose = 1
    if horovod_enabled:
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())
        verbose = 1 if hvd.rank() == 0 else 0

        num_train_steps /= hvd.size()
        num_test_steps /= hvd.size()

    callbacks.append(optim_callbacks)

    model.fit(
        ds_train.repeat(),
        validation_data=ds_test.repeat(),
        epochs=initial_epoch + config["setup"]["num_epochs"],
        callbacks=callbacks,
        steps_per_epoch=num_train_steps,
        validation_steps=num_test_steps,
        initial_epoch=initial_epoch,
        verbose=verbose,
    )