Example #1
def build_summary(bench_config_path: str, base_experiment: str,
                  experimental_experiment: str,
                  download_base_path: str) -> List[ExperimentsDiff]:
    """Build summary and diffs of artifacts.

    bench_config_path: bench config file path. Can be the same one that was used to run
        these experiments.
    base_experiment: name of the experiment we're comparing against.
    experimental_experiment: name of the experiment we're comparing.
    download_base_path: base path under which live the stored artifacts of
        the benchmarking experiments.
    """
    config = load_yaml(bench_config_path)
    downloaded_names = set(
        download_artifacts(config, base_experiment, experimental_experiment,
                           download_base_path))
    experiment_diffs = []
    for n in downloaded_names:
        if isinstance(n, tuple) and len(n) == 2:
            (dataset_name, local_dir) = n
            e = ExperimentsDiff(dataset_name, base_experiment,
                                experimental_experiment, local_dir)
            if not e.empty:
                experiment_diffs.append(e)
    return experiment_diffs
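A minimal usage sketch, assuming hypothetical config paths, experiment names, and artifact store location:

# Hypothetical arguments for illustration only.
diffs = build_summary(
    bench_config_path="bench_config.yaml",
    base_experiment="baseline",
    experimental_experiment="candidate",
    download_base_path="s3://my-bucket/benchmark-artifacts",
)
print(f"{len(diffs)} non-empty experiment diffs")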
Example #2
def setup_experiment(experiment: Dict[str, str]) -> Dict[Any, Any]:
    """Set up the backend and load the Ludwig config.

    experiment: dictionary containing the dataset name, config path, and experiment name.
    Returns a Ludwig config.
    """
    shutil.rmtree(os.path.join(os.getcwd(), experiment["dataset_name"]),
                  ignore_errors=True)
    model_config = load_yaml(os.path.join("configs",
                                          experiment["config_path"]))
    model_config["backend"] = {}
    model_config["backend"]["type"] = "local"
    model_config["backend"]["cache_dir"] = os.path.join(
        os.getcwd(), experiment["dataset_name"], "cache")
    os.makedirs(model_config["backend"]["cache_dir"], exist_ok=True)
    return model_config
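For reference, a sketch of the experiment dictionary this function expects, based on the keys it reads above ("dataset_name", "config_path") and the "experiment_name" mentioned in its docstring; all values are hypothetical:

# Illustrative experiment entry; values are placeholders.
experiment = {
    "dataset_name": "ames_housing",
    "config_path": "ames_housing.yaml",  # resolved under the local "configs" directory
    "experiment_name": "baseline",
}
model_config = setup_experiment(experiment)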
Example #3
def _create_default_config(dataset: Union[str, dd.core.DataFrame, pd.DataFrame,
                                          DatasetInfo],
                           target_name: str = None,
                           time_limit_s: Union[int, float] = None) -> dict:
    """
    Returns auto_train configs for three available combiner models. 
    Coordinates the following tasks:

    - extracts fields and generates list of FieldInfo objects
    - gets field metadata (i.e. avg. words, total non-null entries)
    - builds input_features and output_features section of config
    - for each combiner, adds default training, hyperopt
    - infers resource constraints and adds gpu and cpu resource allocation per
      trial

    # Inputs
    :param dataset: (str) filepath to dataset.
    :param target_name: (str) name of target feature
    :param time_limit_s: (int, float) total time allocated to auto_train. acts
                                    as the stopping parameter

    # Return
    :return: (dict) dictionary containing auto_train config files for all available
    combiner types

    """
    _ray_init()
    resources = get_available_resources()
    experiment_resources = allocate_experiment_resources(resources)

    dataset_info = dataset
    if not isinstance(dataset, DatasetInfo):
        dataset_info = get_dataset_info(dataset)

    input_and_output_feature_config = get_features_config(
        dataset_info.fields, dataset_info.row_count, resources, target_name)

    model_configs = {}
    for model_name, path_to_defaults in model_defaults.items():
        default_model_config = load_yaml(path_to_defaults)
        default_model_config.update(input_and_output_feature_config)
        default_model_config['hyperopt']['executor'].update(
            experiment_resources)
        default_model_config['hyperopt']['executor'][
            'time_budget_s'] = time_limit_s
        model_configs[model_name] = default_model_config
    return model_configs
Example #4
def benchmark(bench_config_path: str) -> None:
    """Launch benchmarking suite from a benchmarking config.

    bench_config_path: config for the benchmarking tool. Specifies datasets and their
        corresponding Ludwig configs, as well as export options.
    """
    config = load_yaml(bench_config_path)
    for experiment in config["datasets"]:
        try:
            if "experiment_name" not in experiment:
                experiment["experiment_name"] = config[
                    "global_experiment_name"]
            benchmark_one_local(experiment,
                                export_artifacts_dict=config["export"][0])
        except Exception:
            print("Benchmarking {} {} failed".format(
                experiment["dataset_name"], experiment["experiment_name"]))
            print(traceback.format_exc())
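Based on the keys accessed above, the loaded benchmarking config roughly takes the shape below; the fields inside each export entry are an assumption, since only config["export"][0] is passed through:

# Illustrative shape of the dict returned by load_yaml(bench_config_path).
# The keys inside the export entry are hypothetical.
config = {
    "global_experiment_name": "nightly",
    "export": [
        {"export_artifacts": True, "export_base_path": "s3://my-bucket/benchmark-artifacts"},
    ],
    "datasets": [
        {"dataset_name": "ames_housing", "config_path": "ames_housing.yaml"},
        {"dataset_name": "sarcos", "config_path": "sarcos.yaml",
         "experiment_name": "sarcos_baseline"},
    ],
}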
Example #5
def _create_default_config(
    dataset: Union[str, dd.core.DataFrame, pd.DataFrame, DatasetInfo],
    target_name: Union[str, List[str]] = None,
    time_limit_s: Union[int, float] = None,
    random_seed: int = default_random_seed,
) -> Tuple[dict, dict]:
    """Returns auto_train configs for three available combiner models. Coordinates the following tasks:

    - extracts fields and generates list of FieldInfo objects
    - gets field metadata (i.e. avg. words, total non-null entries)
    - builds input_features and output_features section of config
    - for each combiner, adds default training, hyperopt
    - infers resource constraints and adds gpu and cpu resource allocation per
      trial

    # Inputs
    :param dataset: (str) filepath to dataset.
    :param target_name: (str, List[str]) name of target feature
    :param time_limit_s: (int, float) total time allocated to auto_train. acts
                                    as the stopping parameter
    :param random_seed: (int, default: `42`) a random seed that will be used anywhere
                        there is a call to a random number generator, including
                        hyperparameter search sampling, as well as data splitting,
                        parameter initialization and training set shuffling

    # Return
    :return: (Tuple[dict, dict]) dictionary containing auto_train config files for all
    available combiner types, along with the metadata of the dataset features
    """
    _ray_init()
    resources = get_available_resources()
    experiment_resources = allocate_experiment_resources(resources)

    dataset_info = dataset
    if not isinstance(dataset, DatasetInfo):
        dataset_info = get_dataset_info(dataset)

    input_and_output_feature_config, features_metadata = get_features_config(
        dataset_info.fields, dataset_info.row_count, resources, target_name)
    # create set of all feature types appearing in the dataset
    feature_types = [[feat[TYPE] for feat in features]
                     for features in input_and_output_feature_config.values()]
    feature_types = set(sum(feature_types, []))

    model_configs = {}

    # read in base config and update with experiment resources
    base_automl_config = load_yaml(BASE_AUTOML_CONFIG)
    base_automl_config[HYPEROPT][EXECUTOR].update(experiment_resources)
    base_automl_config[HYPEROPT][EXECUTOR]["time_budget_s"] = time_limit_s
    if time_limit_s is not None:
        base_automl_config[HYPEROPT][EXECUTOR][SCHEDULER][
            "max_t"] = time_limit_s
    base_automl_config[HYPEROPT][SEARCH_ALG]["random_state_seed"] = random_seed
    base_automl_config.update(input_and_output_feature_config)

    model_configs["base_config"] = base_automl_config

    # read in all encoder configs
    for feat_type, default_configs in encoder_defaults.items():
        if feat_type in feature_types:
            if feat_type not in model_configs.keys():
                model_configs[feat_type] = {}
            for encoder_name, encoder_config_path in default_configs.items():
                model_configs[feat_type][encoder_name] = load_yaml(
                    encoder_config_path)

    # read in all combiner configs
    model_configs[COMBINER] = {}
    for combiner_type, default_config in combiner_defaults.items():
        combiner_config = load_yaml(default_config)
        model_configs[COMBINER][combiner_type] = combiner_config

    return model_configs, features_metadata
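A usage sketch for this version, assuming a hypothetical CSV dataset and target column:

# Hypothetical call; returns the per-model configs plus the feature metadata.
model_configs, features_metadata = _create_default_config(
    dataset="train.csv",
    target_name="price",
    time_limit_s=3600,
)
base_config = model_configs["base_config"]   # shared AutoML config
combiner_configs = model_configs[COMBINER]   # one default config per combiner type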
Example #6
def _get_reference_configs() -> dict:
    """Load and return the reference model configs from REFERENCE_CONFIGS."""
    reference_configs = load_yaml(REFERENCE_CONFIGS)
    return reference_configs
Example #7
def _create_default_config(
    dataset: Union[str, dd.core.DataFrame, pd.DataFrame, DatasetInfo],
    target_name: Union[str, List[str]] = None,
    time_limit_s: Union[int, float] = None,
) -> dict:
    """Returns auto_train configs for three available combiner models. Coordinates the following tasks:

    - extracts fields and generates list of FieldInfo objects
    - gets field metadata (i.e. avg. words, total non-null entries)
    - builds input_features and output_features section of config
    - for each combiner, adds default training, hyperopt
    - infers resource constraints and adds gpu and cpu resource allocation per
      trial

    # Inputs
    :param dataset: (str) filepath to dataset.
    :param target_name: (str, List[str]) name of target feature
    :param time_limit_s: (int, float) total time allocated to auto_train. acts
                                    as the stopping parameter

    # Return
    :return: (dict) dictionary containing auto_train config files for all available
    combiner types
    """
    _ray_init()
    resources = get_available_resources()
    experiment_resources = allocate_experiment_resources(resources)

    dataset_info = dataset
    if not isinstance(dataset, DatasetInfo):
        dataset_info = get_dataset_info(dataset)

    input_and_output_feature_config = get_features_config(
        dataset_info.fields, dataset_info.row_count, resources, target_name)

    model_configs = {}

    # read in base config and update with experiment resources
    base_automl_config = load_yaml(BASE_AUTOML_CONFIG)
    base_automl_config["hyperopt"]["executor"].update(experiment_resources)
    base_automl_config["hyperopt"]["executor"]["time_budget_s"] = time_limit_s
    if time_limit_s is not None:
        base_automl_config["hyperopt"]["sampler"]["scheduler"][
            "max_t"] = time_limit_s
    base_automl_config.update(input_and_output_feature_config)

    model_configs["base_config"] = base_automl_config

    # read in all encoder configs
    for feat_type, default_configs in encoder_defaults.items():
        if feat_type not in model_configs.keys():
            model_configs[feat_type] = {}
        for encoder_name, encoder_config_path in default_configs.items():
            model_configs[feat_type][encoder_name] = load_yaml(
                encoder_config_path)

    # read in all combiner configs
    model_configs["combiner"] = {}
    for combiner_type, default_config in combiner_defaults.items():
        combiner_config = load_yaml(default_config)
        model_configs["combiner"][combiner_type] = combiner_config

    return model_configs
Example #8
def experiment_cli(
    config: Union[str, dict],
    dataset: Union[str, dict, pd.DataFrame] = None,
    training_set: Union[str, dict, pd.DataFrame] = None,
    validation_set: Union[str, dict, pd.DataFrame] = None,
    test_set: Union[str, dict, pd.DataFrame] = None,
    training_set_metadata: Union[str, dict] = None,
    data_format: str = None,
    experiment_name: str = "experiment",
    model_name: str = "run",
    model_load_path: str = None,
    model_resume_path: str = None,
    eval_split: str = TEST,
    skip_save_training_description: bool = False,
    skip_save_training_statistics: bool = False,
    skip_save_model: bool = False,
    skip_save_progress: bool = False,
    skip_save_log: bool = False,
    skip_save_processed_input: bool = False,
    skip_save_unprocessed_output: bool = False,
    skip_save_predictions: bool = False,
    skip_save_eval_stats: bool = False,
    skip_collect_predictions: bool = False,
    skip_collect_overall_stats: bool = False,
    output_directory: str = "results",
    gpus: Union[str, int, List[int]] = None,
    gpu_memory_limit: int = None,
    allow_parallel_threads: bool = True,
    callbacks: List[Callback] = None,
    backend: Union[Backend, str] = None,
    random_seed: int = default_random_seed,
    debug: bool = False,
    logging_level: int = logging.INFO,
    **kwargs
):
    """Trains a model on a dataset's training and validation splits and uses it to predict on the test split. It
    saves the trained model and the statistics of training and testing.

     # Inputs

     :param config: (Union[str, dict]) in-memory representation of
             config or string path to a YAML config file.
     :param dataset: (Union[str, dict, pandas.DataFrame], default: `None`)
         source containing the entire dataset to be used in the experiment.
         If it has a split column, it will be used for splitting (0 for train,
         1 for validation, 2 for test), otherwise the dataset will be
         randomly split.
     :param training_set: (Union[str, dict, pandas.DataFrame], default: `None`)
         source containing training data.
     :param validation_set: (Union[str, dict, pandas.DataFrame], default: `None`)
         source containing validation data.
     :param test_set: (Union[str, dict, pandas.DataFrame], default: `None`)
         source containing test data.
     :param training_set_metadata: (Union[str, dict], default: `None`)
         metadata JSON file or loaded metadata.  Intermediate preprocessed
         structure containing the mappings of the input
         dataset created the first time an input file is used in the same
         directory with the same name and a '.meta.json' extension.
     :param data_format: (str, default: `None`) format to interpret data
         sources. Will be inferred automatically if not specified.  Valid
         formats are `'auto'`, `'csv'`, `'excel'`, `'feather'`,
         `'fwf'`, `'hdf5'` (cache file produced during previous training),
         `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
         `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
         `'stata'`, `'tsv'`.
     :param experiment_name: (str, default: `'experiment'`) name for
         the experiment.
     :param model_name: (str, default: `'run'`) name of the model that is
         being used.
     :param model_load_path: (str, default: `None`) if this is specified the
         loaded model will be used as initialization
         (useful for transfer learning).
     :param model_resume_path: (str, default: `None`) resumes training of
         the model from the path specified. The config is restored.
         In addition to config, training statistics and loss for
         epoch and the state of the optimizer are restored such that
         training can be effectively continued from a previously interrupted
         training process.
     :param eval_split: (str, default: `test`) split on which
         to perform evaluation. Valid values are `training`, `validation`
         and `test`.
     :param skip_save_training_description: (bool, default: `False`) disables
         saving the description JSON file.
     :param skip_save_training_statistics: (bool, default: `False`) disables
         saving training statistics JSON file.
     :param skip_save_model: (bool, default: `False`) disables
         saving model weights and hyperparameters each time the model
         improves. By default Ludwig saves model weights after each epoch
         the validation metric improves, but if the model is really big
         that can be time consuming. If you do not want to keep
         the weights and just find out what performance a model can get
         with a set of hyperparameters, use this parameter to skip it,
         but the model will not be loadable later on and the returned model
         will have the weights obtained at the end of training, instead of
         the weights of the epoch with the best validation performance.
     :param skip_save_progress: (bool, default: `False`) disables saving
         progress each epoch. By default Ludwig saves weights and stats
         after each epoch for enabling resuming of training, but if
         the model is really big that can be time consuming and will use
         twice as much space. Use this parameter to skip it, but training
         cannot be resumed later on.
     :param skip_save_log: (bool, default: `False`) disables saving
         TensorBoard logs. By default Ludwig saves logs for the TensorBoard,
         but if it is not needed turning it off can slightly increase the
         overall speed.
     :param skip_save_processed_input: (bool, default: `False`) if an input
         dataset is provided, it is preprocessed and cached by saving HDF5
         and JSON files to avoid running the preprocessing again. If this
         parameter is `True`, the HDF5 and JSON files are not saved.
     :param skip_save_unprocessed_output: (bool, default: `False`) by default
         predictions and their probabilities are saved both in raw
         unprocessed numpy files containing tensors and as postprocessed
         CSV files (one for each output feature). If this parameter is `True`,
         only the CSV ones are saved and the numpy ones are skipped.
     :param skip_save_predictions: (bool, default: `False`) skips saving test
         predictions CSV files.
     :param skip_save_eval_stats: (bool, default: `False`) skips saving test
         statistics JSON file.
     :param skip_collect_predictions: (bool, default: `False`) skips
         collecting post-processed predictions during eval.
     :param skip_collect_overall_stats: (bool, default: `False`) skips
         collecting overall stats during eval.
     :param output_directory: (str, default: `'results'`) the directory that
         will contain the training statistics, TensorBoard logs, the saved
         model and the training progress files.
     :param gpus: (list, default: `None`) list of GPUs that are available
         for training.
     :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to
         allocate per GPU device.
     :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
         to use multithreading parallelism to improve performance at
         the cost of determinism.
     :param callbacks: (list, default: `None`) a list of
         `ludwig.callbacks.Callback` objects that provide hooks into the
         Ludwig pipeline.
     :param backend: (Union[Backend, str]) `Backend` or string name
         of backend to use to execute preprocessing / training steps.
     :param random_seed: (int, default: `42`) random seed used for weights
         initialization, splits and any other random function.
     :param debug: (bool, default: `False`) if `True` turns on `tfdbg` with
         `inf_or_nan` checks.
     :param logging_level: (int) Log level that will be sent to stderr.

     # Return
     :return: (Tuple[LudwigModel, dict, dict, tuple, str]) `(model, evaluation_statistics, training_statistics, preprocessed_data, output_directory)`
         `model` LudwigModel instance,
         `evaluation_statistics` dictionary with evaluation performance
             statistics on the test_set,
         `training_statistics` dictionary of training statistics for each
             output feature containing loss and metrics values for each epoch,
         `preprocessed_data` tuple containing preprocessed
             `(training_set, validation_set, test_set)`,
         `output_directory` filepath string to where results are stored.
    """
    if isinstance(config, str):
        config = load_yaml(config)
    backend = initialize_backend(backend or config.get("backend"))

    if model_load_path:
        model = LudwigModel.load(
            model_load_path,
            logging_level=logging_level,
            backend=backend,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
            callbacks=callbacks,
        )
    else:
        model = LudwigModel(
            config=config,
            logging_level=logging_level,
            backend=backend,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
            callbacks=callbacks,
        )
    (eval_stats, train_stats, preprocessed_data, output_directory) = model.experiment(
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        model_resume_path=model_resume_path,
        eval_split=eval_split,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        skip_collect_predictions=skip_collect_predictions,
        skip_collect_overall_stats=skip_collect_overall_stats,
        output_directory=output_directory,
        random_seed=random_seed,
        debug=debug,
    )

    return model, eval_stats, train_stats, preprocessed_data, output_directory
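A minimal invocation sketch; the config and dataset file names are hypothetical:

# Hypothetical file names; config may also be an in-memory dict.
model, eval_stats, train_stats, preprocessed_data, output_dir = experiment_cli(
    config="config.yaml",
    dataset="data.csv",  # split randomly unless it contains a split column
    output_directory="results",
)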