Example No. 1
    def show_experiment_config(self) -> None:
        """
        print the experiment config for self.experiment_name, or list all experiment configs if it is 'all'
        -----------------------
        :used attr: experiment_name: [str], e.g. 'exp0' or 'all'
        """
        from nerblackbox.modules.utils.env_variable import env_variable

        if self.experiment_name != "all":
            path_experiment_config = join(
                env_variable("DIR_EXPERIMENT_CONFIGS"),
                f"{self.experiment_name}.ini")
            if isfile(path_experiment_config):
                with open(path_experiment_config, "r") as file:
                    lines = file.read()

                print(f"> experiment_config = {path_experiment_config}")
                print()
                print(lines)
            else:
                print(
                    f"> experiment_config = {path_experiment_config} does not exist."
                )
        else:
            experiment_configs = glob.glob(
                join(env_variable("DIR_EXPERIMENT_CONFIGS"), "*.ini"))
            experiment_configs = [
                elem.split("/")[-1][: -len(".ini")]  # note: .strip(".ini") would strip characters, not the suffix
                for elem in experiment_configs
            ]
            experiment_configs = [
                elem for elem in experiment_configs if elem != "default"
            ]
            for experiment_config in experiment_configs:
                print(experiment_config)
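The 'all' branch above reduces to listing every '*.ini' file in the experiment-config directory except the 'default' template. A minimal standalone sketch of that logic, with config_dir as a hypothetical stand-in for env_variable("DIR_EXPERIMENT_CONFIGS"):

import glob
import os
from os.path import join

config_dir = "/path/to/experiment_configs"  # stand-in for env_variable("DIR_EXPERIMENT_CONFIGS")

# collect config names: basename without the ".ini" suffix, skipping the 'default' template
config_names = [
    os.path.basename(path)[: -len(".ini")]
    for path in glob.glob(join(config_dir, "*.ini"))
]
for name in config_names:
    if name != "default":
        print(name)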
Example No. 2
def get_dataset_path(dataset: str, subset: str = "") -> str:
    """
    get path to the directory of the given dataset (and subset, if specified)

    Args:
        dataset: e.g. 'suc', 'swedish_ner_corpus'
        subset: e.g. 'original_cased'

    Returns:
        dataset_path: path to dataset directory
    """
    if len(subset):
        return join(env_variable("DIR_DATASETS"), dataset, subset)
    else:
        return join(env_variable("DIR_DATASETS"), dataset)
Example No. 3
    def extract_best_single_run(self) -> None:
        """
        derive the best single run (first row of self.single_runs) and store it as a dict in self.best_single_run
        """
        if self.experiment is not None and self.single_runs is not None:
            _df_best_single_run = self.single_runs.iloc[0, :]

            assert (
                self.name is not None
            ), f"ERROR! self.name is None, extract_best_single_run() failed."
            checkpoint = join(
                env_variable("DIR_CHECKPOINTS"),
                self.name,
                _df_best_single_run[("info", "run_name_nr")],
                epoch2checkpoint(_df_best_single_run[("metrics", "EPOCH_BEST")]),
            )

            fields_info = ["run_id", "run_name_nr"]

            self.best_single_run = dict(
                **{
                    "exp_id": self._id,
                    "exp_name": self.name,
                    "checkpoint": checkpoint if isfile(checkpoint) else None,
                },
                **{
                    field: _df_best_single_run[("info", field)] for field in fields_info
                },
                **{
                    field: _df_best_single_run[("metrics", field)]
                    for field in self.METRICS_PLUS.values()
                },
            )
        else:
            self.best_single_run = dict()
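For orientation, the resulting self.best_single_run dictionary has roughly the following shape; the values are illustrative, the checkpoint file name is whatever epoch2checkpoint() produces, and the metric keys depend on METRICS_PLUS:

best_single_run = {
    "exp_id": "0",                                                    # self._id
    "exp_name": "exp0",                                               # self.name
    "checkpoint": "<DIR_CHECKPOINTS>/exp0/runA-1/<checkpoint file>",  # None if the file does not exist
    "run_id": "a1b2c3",                                               # from the ("info", ..) columns
    "run_name_nr": "runA-1",
    # ..plus one entry per metric in self.METRICS_PLUS.values(), taken from the ("metrics", ..) columns
}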
Example No. 4
    def _get_config(
        self, experiment_name: str
    ) -> Tuple[Dict[str, Dict[str, str]], List[str]]:
        """
        get ConfigParser instance and derive config dictionary from it

        Args:
            experiment_name: e.g. 'exp1', 'default'

        Returns:
            _config_dict: keys = config sections [str], values = [dict] mapping parameter names to parameter values
            _run_names: e.g. ["runA", "runB"]
        """
        config_path = join(
            env_variable("DIR_EXPERIMENT_CONFIGS"), f"{experiment_name}.ini"
        )
        if not os.path.isfile(config_path):
            raise Exception(f"config file at {config_path} does not exist")

        _config = ConfigParser()
        _config.read(config_path)
        _config_dict: Dict[str, Dict[str, Any]] = {
            s: dict(_config.items(s)) for s in _config.sections()
        }  # {'hparams': {'monitor': 'val_loss'}}
        _config_dict = {
            s: {k: self._convert(k, v) for k, v in subdict.items()}
            for s, subdict in _config_dict.items()
        }

        # combine sections 'dataset', 'model' & 'settings' to single section 'params'
        _config_dict["params"] = dict()
        for s in ["dataset", "model", "settings"]:
            if s in _config_dict.keys():
                _config_dict["params"].update(_config_dict[s])
                _config_dict.pop(s)

        # derive uncased
        if (
            "uncased" not in _config_dict["params"].keys()
            and "pretrained_model_name" in _config_dict["params"]
        ):
            if "uncased" in _config_dict["params"]["pretrained_model_name"]:
                _config_dict["params"]["uncased"] = True
            elif "cased" in _config_dict["params"]["pretrained_model_name"]:
                _config_dict["params"]["uncased"] = False
            else:
                _config_dict["params"]["uncased"] = False
                print(
                    "ATTENTION! could not derive uncased = True/False from pretrained_model_name."
                    " => assume model is cased"
                )

        _run_names = [
            run_name for run_name in _config.sections() if run_name.startswith("run")
        ]

        return _config_dict, _run_names
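For orientation, with a config file that contains the sections [dataset], [model], [hparams] and [runA] (the layout written by _write_config_file further down), the return values look roughly like this (values illustrative; 'uncased' is derived from the model name as shown above):

_config_dict = {
    "params": {                                   # merged from [dataset], [model] and [settings]
        "dataset_name": "swedish_ner_corpus",
        "pretrained_model_name": "bert-base-uncased",
        "uncased": True,                          # derived from "uncased" in pretrained_model_name
    },
    "hparams": {"monitor": "val_loss"},
    "runA": {},
}
_run_names = ["runA"]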
Example No. 5
def assert_that_experiment_hasnt_been_run_before(experiment_name: str) -> None:
    """
    assert that no checkpoint directory exists yet for the given experiment

    Args:
        experiment_name: e.g. 'my_experiment'
    """
    experiment_directory = join(env_variable("DIR_CHECKPOINTS"),
                                experiment_name)
    if isdir(experiment_directory):
        raise Exception(
            f"ERROR! experiment = {experiment_name} has been run before ({experiment_directory} exists)"
        )
Example No. 6
def get_available_datasets() -> List[str]:
    """
    get datasets that are available in DIR_DATASETS directory

    Returns:
        available datasets: e.g. ['suc', 'swedish_ner_corpus']
    """
    dir_datasets = env_variable("DIR_DATASETS")
    return [
        folder
        for folder in os.listdir(dir_datasets)
        if os.path.isdir(join(dir_datasets, folder))
    ]
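Usage sketch, assuming DIR_DATASETS is set and contains one sub-directory per prepared dataset:

available = get_available_datasets()
print(available)  # e.g. ['suc', 'swedish_ner_corpus']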
Example No. 7
def _parse_args(_parser, _args):
    """
    :param _parser: [argparse ArgumentParser]
    :param _args:   [argparse arguments]
    :return _params:   [argparse.Namespace] attr: experiment_name, run_name, device, fp16, from_config
    :return _log_dirs: [argparse.Namespace] attr: mlflow, tensorboard
    """
    # parsing
    _params = None
    for group in _parser._action_groups:
        group_dict = {
            a.dest: getattr(_args, a.dest, None)
            for a in group._group_actions
        }
        if group.title == "args_general":
            group_dict["device"] = torch.device(
                "cuda" if torch.cuda.is_available()
                and group_dict["device"] == "gpu" else "cpu")
            group_dict["fp16"] = (True if group_dict["fp16"]
                                  and group_dict["device"].type == "cuda" else
                                  False)
            group_dict["from_config"] = bool(group_dict["from_config"])
            if len(group_dict["run_name"]) == 0:
                group_dict["run_name"] = None
            _params = argparse.Namespace(**group_dict)

    # log_dirs
    _log_dirs_dict = {
        "mlflow": env_variable("DIR_MLFLOW"),
        "tensorboard": env_variable("DIR_TENSORBOARD"),
        "checkpoints": env_variable("DIR_CHECKPOINTS"),
        "log_file": env_variable("LOG_FILE"),
        "mlflow_file": env_variable("MLFLOW_FILE"),
    }
    _log_dirs = argparse.Namespace(**_log_dirs_dict)

    return _params, _log_dirs
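A hedged sketch of the kind of parser this function expects: the group title "args_general" and the argument names match what the parsing loop reads above, while the defaults are illustrative assumptions:

import argparse

_parser = argparse.ArgumentParser()
group = _parser.add_argument_group("args_general")
group.add_argument("--experiment_name", type=str, default="exp0")
group.add_argument("--run_name", type=str, default="")    # empty string is mapped to None
group.add_argument("--device", type=str, default="gpu")   # mapped to torch.device("cuda"/"cpu")
group.add_argument("--fp16", type=int, default=0)         # only kept if the device ends up on cuda
group.add_argument("--from_config", type=int, default=0)  # cast to bool

_args = _parser.parse_args([])
# _params, _log_dirs = _parse_args(_parser, _args)  # requires torch and the DIR_*/LOG_FILE/MLFLOW_FILE env variables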
Example No. 8
    def clear_data(self) -> None:
        """
        :used attr: results [bool] if True, clear not only checkpoints but also mlflow, tensorboard and logs
        """
        data_dir = env_variable("DATA_DIR")
        results_dir = join(data_dir, "results")
        assert isdir(results_dir), f"directory {results_dir} does not exist."

        # checkpoints
        objects_to_remove = glob.glob(
            join(results_dir, "checkpoints", "*")
        )  # list of paths under the checkpoints directory

        # results (mlflow, tensorboard, ..)
        if self.results:
            results_files = (glob.glob(join(results_dir, "mlruns", "*")) +
                             glob.glob(join(results_dir, "mlruns", ".*")) +
                             glob.glob(join(results_dir, "tensorboard", "*")) +
                             glob.glob(join(results_dir, "logs.log")) +
                             glob.glob(join(results_dir, "*.npy")))
            objects_to_remove.extend(results_files)

        if len(objects_to_remove) == 0:
            print(f"There is no data to remove in {results_dir}")
        else:
            for elem in objects_to_remove:
                print(elem)
            while 1:
                answer = input("Do you want to remove the above files? (y/n) ")
                if answer == "y":
                    for elem in objects_to_remove:
                        if isfile(elem):
                            os.remove(elem)
                        elif isdir(elem):
                            shutil.rmtree(elem, ignore_errors=False)
                        else:
                            raise ValueError(
                                f"object {elem} is neither a file nor a dir and cannot be removed"
                            )
                    print(f"Files removed")
                    break
                elif answer == "n":
                    print(f"No files removed")
                    break
                else:
                    print("Please enter either y or n")
Example No. 9
    def _create_data_directory(self) -> None:
        if resource_isdir(Requirement.parse("nerblackbox"),
                          "nerblackbox/modules/data"):
            data_source = resource_filename(Requirement.parse("nerblackbox"),
                                            "nerblackbox/modules/data")

            data_dir = env_variable("DATA_DIR")
            if self.verbose:
                print("data_source =", data_source)
                print("data_target =", data_dir)

            if os.path.isdir(data_dir):
                print(f"init: target {data_dir} already exists")
            else:
                shutil.copytree(data_source, data_dir)
                print(f"init: target {data_dir} created")
        else:
            print("init not executed successfully")
            exit(0)
Example No. 10
    def _write_config_file(self) -> None:
        """
        write config file based on self.hparams
        """
        # assert that config file does not exist
        config_path = join(env_variable("DIR_EXPERIMENT_CONFIGS"),
                           f"{self.experiment_name}.ini")
        assert (
            isfile(config_path) is False
        ), f"ERROR! experiment config file {config_path} already exists!"

        # write config file: helper functions
        def _write(_str: str):
            f.write(_str + "\n")

        def _write_key_value(_key: str):
            assert (
                self.hparams is not None
            ), f"ERROR! self.hparams is None - _write_key_value() failed."
            if _key in self.hparams.keys():
                f.write(f"{_key} = {self.hparams[_key]}\n")

        # write config file
        with open(config_path, "w") as f:
            _write("[dataset]")
            for key in DATASET.keys():
                _write_key_value(key)

            _write("\n[model]")
            for key in MODEL.keys():
                _write_key_value(key)

            _write("\n[settings]")
            for key in SETTINGS.keys():
                _write_key_value(key)

            _write("\n[hparams]")
            for key in HPARAMS.keys():
                _write_key_value(key)

            _write("\n[runA]")
Example No. 11
def _get_model_checkpoint_directory(_params):
    """
    :param _params:     [argparse.Namespace] attr: experiment_name, run_name, pretrained_model_name, dataset_name, ..
    :return: model_checkpoint_directory [str]
    """
    return join(env_variable("DIR_CHECKPOINTS"), _params.experiment_run_name_nr)
Example No. 12
    def __init__(
            self,
            flag: str,
            usage: str = "cli",
            dataset_name: Optional[str] = None,  # analyze_data & set_up_dataset
            dataset_subset_name: Optional[str] = None,  # set_up_dataset
            modify: bool = True,  # set_up_dataset
            val_fraction: float = 0.3,  # set_up_dataset
            verbose: bool = False,
            experiment_name: Optional[str] = None,
            hparams: Optional[Dict[str, Union[str, int, bool]]] = None,  # run_experiment
            from_preset: Optional[str] = None,  # run_experiment
            from_config: bool = False,  # run_experiment
            run_name: Optional[str] = None,  # run_experiment
            device: str = "gpu",  # run_experiment
            fp16: bool = False,  # run_experiment
            text_input: Optional[str] = None,  # predict
            ids: Tuple[str, ...] = (),  # get_experiments, get_experiments_results
            as_df: bool = True,  # get_experiments, get_experiments_results
            results: bool = False,  # clear_data
    ):
        """
        :param flag:                [str], e.g. 'analyze_data', 'set_up_dataset', 'run_experiment', ..
        :param usage:               [str] 'cli' or 'api'
        :param dataset_name:        [str] e.g. 'swedish_ner_corpus'
        :param dataset_subset_name: [str] e.g. 'simple_cased'
        :param modify:              [bool] if True: modify tags as specified in method modify_ner_tag_mapping()
        :param val_fraction:        [float] e.g. 0.3
        :param verbose:             [bool]
        :param experiment_name:     [str], e.g. 'exp0'
        :param hparams:             [dict], e.g. {'multiple_runs': '2'} with hparams to use            [HIERARCHY:  I]
        :param from_preset:         [str], e.g. 'adaptive' get experiment params & hparams from preset [HIERARCHY: II]
        :param from_config:         [bool] if True, get experiment params & hparams from config file   [ALTERNATIVE]
        :param run_name:            [str or None], e.g. 'runA'
        :param device:              [str]
        :param fp16:                [bool]
        :param text_input:          [str], e.g. 'this is some text that needs to be annotated'
        :param ids:                 [tuple of str], experiment_ids to include
        :param as_df:               [bool] if True, return pandas DataFrame, else return dict
        :param results:             [bool] if True, clear not only checkpoints but also mlflow, tensorboard and logs
        """
        self._assert_flag(flag)

        os.environ["MLFLOW_TRACKING_URI"] = env_variable("DIR_MLFLOW")

        self.flag = flag
        self.usage = usage
        self.dataset_name = dataset_name  # analyze_data & set_up_dataset
        self.dataset_subset_name = dataset_subset_name  # set_up_dataset
        self.modify = modify  # set_up_dataset
        self.val_fraction = val_fraction  # set_up_dataset
        self.verbose = verbose
        self.experiment_name = experiment_name
        self.hparams: Optional[Dict[str, Union[str, int, bool]]] = self._process_hparams(
            hparams, from_preset
        )
        self.from_config: bool = from_config
        self.run_name = run_name  # run_experiment
        self.device = device  # run_experiment
        self.fp16 = fp16  # run_experiment
        self.text_input = text_input  # predict
        self.ids = ids  # get_experiments, get_experiments_results
        self.as_df = as_df  # get_experiments, get_experiments_results
        self.results = results  # clear_data

        if self.flag == "run_experiment":
            assert (self.hparams is None and self.from_config is True) or (
                self.hparams is not None and self.from_config is False), (
                    f"ERROR! Need to specify "
                    f"EITHER hparams (currently {self.hparams}) "
                    f"with or without from_preset (currently {from_preset}) "
                    f"OR from_config (currently {self.from_config}).")

            if self.from_config:
                path_experiment_config = join(
                    env_variable("DIR_EXPERIMENT_CONFIGS"),
                    f"{self.experiment_name}.ini",
                )
                if not isfile(path_experiment_config):
                    self._exit_gracefully(
                        f"experiment_config = {path_experiment_config} does not exist."
                    )
            else:
                assert (
                    self.hparams is not None
                ), f"ERROR! self.hparams is None but needs to be specified if dynamic arguments are used."
                for field in ["pretrained_model_name", "dataset_name"]:
                    if field not in self.hparams.keys():
                        field_displayed = ("model"
                                           if field == "pretrained_model_name"
                                           else "dataset")
                        self._exit_gracefully(
                            f"{field_displayed} is not specified but mandatory if dynamic arguments are used."
                        )

        data_dir = env_variable("DATA_DIR")
        if os.path.isdir(data_dir):
            self._set_client_and_get_experiments()
        else:
            # will be set in init() method
            self.client = None
            self.experiment_id2name = None
            self.experiment_name2id = None
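The assertions above imply that a run_experiment call must supply either hparams (optionally derived from from_preset) or from_config=True, never both. A hedged sketch of the two valid keyword-argument sets one would pass to this constructor (the class name itself is not shown in this snippet):

# variant 1: dynamic arguments -- "pretrained_model_name" and "dataset_name" are mandatory in hparams
kwargs_dynamic = dict(
    flag="run_experiment",
    experiment_name="exp0",
    hparams={"pretrained_model_name": "bert-base-uncased", "dataset_name": "swedish_ner_corpus"},
)

# variant 2: read experiment params & hparams from <DIR_EXPERIMENT_CONFIGS>/exp0.ini instead
kwargs_from_config = dict(
    flag="run_experiment",
    experiment_name="exp0",
    from_config=True,
)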