Example #1
def _create_default_config(dataset: Union[str, dd.core.DataFrame, pd.DataFrame,
                                          DatasetInfo],
                           target_name: str = None,
                           time_limit_s: Union[int, float] = None) -> dict:
    """
    Returns auto_train configs for three available combiner models. 
    Coordinates the following tasks:

    - extracts fields and generates list of FieldInfo objects
    - gets field metadata (i.e. avg. words, total non-null entries)
    - builds input_features and output_features section of config
    - for each combiner, adds default training and hyperopt sections
    - infers resource constraints and adds gpu and cpu resource allocation per
      trial

    # Inputs
    :param dataset: (str, pd.DataFrame, dd.core.DataFrame, DatasetInfo) path to
                    the dataset, an in-memory DataFrame, or precomputed dataset
                    metadata
    :param target_name: (str) name of the target feature
    :param time_limit_s: (int, float) total time in seconds allocated to
                         auto_train; acts as the stopping parameter

    # Return
    :return: (dict) auto_train configs, one per available combiner type

    """
    _ray_init()
    resources = get_available_resources()
    experiment_resources = allocate_experiment_resources(resources)

    dataset_info = dataset
    if not isinstance(dataset, DatasetInfo):
        dataset_info = get_dataset_info(dataset)

    input_and_output_feature_config = get_features_config(
        dataset_info.fields, dataset_info.row_count, resources, target_name)

    model_configs = {}
    for model_name, path_to_defaults in model_defaults.items():
        default_model_config = load_yaml(path_to_defaults)
        default_model_config.update(input_and_output_feature_config)
        default_model_config['hyperopt']['executor'].update(
            experiment_resources)
        default_model_config['hyperopt']['executor'][
            'time_budget_s'] = time_limit_s
        model_configs[model_name] = default_model_config
    return model_configs
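
A minimal usage sketch for the function above; the import path, dataset file, and target column are assumptions, since the snippet does not show its module context:

# Hypothetical usage (import path, file name, and column name are assumptions).
from ludwig.automl.base_config import _create_default_config

model_configs = _create_default_config(
    "sales.csv",             # hypothetical dataset path
    target_name="revenue",   # hypothetical target column
    time_limit_s=3600,       # stop auto_train after one hour
)
for model_name, config in model_configs.items():
    # every returned config shares the same executor resources and time budget
    print(model_name, config["hyperopt"]["executor"]["time_budget_s"])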
Example #2
def train_with_config(
    dataset: Union[str, pd.DataFrame, dd.core.DataFrame],
    config: dict,
    output_directory: str = OUTPUT_DIR,
    random_seed: int = default_random_seed,
    **kwargs,
) -> AutoTrainResults:
    """Performs hyperparameter optimization with respect to the given config and selects the best model.

    # Inputs
    :param dataset: (str, pd.DataFrame, dd.core.DataFrame) filepath to the
                    dataset or an in-memory DataFrame.
    :param config: (dict) Ludwig configuration to use for training, e.g. one
                   produced by `create_auto_config`.
    :param output_directory: (str) directory into which to write results, defaults to
        current working directory.
    :param random_seed: (int, default: `42`) a random seed that will be used anywhere
                        there is a call to a random number generator, including
                        hyperparameter search sampling, as well as data splitting,
                        parameter initialization and training set shuffling

    # Returns
    :return: (AutoTrainResults) results containing hyperopt experiments and best model
    """
    _ray_init()
    model_type = get_model_type(config)
    hyperopt_results = _train(config,
                              dataset,
                              output_directory=output_directory,
                              model_name=model_type,
                              random_seed=random_seed,
                              **kwargs)
    # catch edge case where metric_score is nan
    # TODO (ASN): Decide how we want to proceed if at least one trial has
    # completed
    for trial in hyperopt_results.ordered_trials:
        if np.isnan(trial.metric_score):
            warnings.warn(
                "There was an error running the experiment. "
                "A trial failed to start. "
                "Consider increasing the time budget for the experiment.")

    experiment_analysis = hyperopt_results.experiment_analysis
    return AutoTrainResults(experiment_analysis)
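
A hedged end-to-end sketch; `create_auto_config` is only referenced in the docstring above, so the import paths and its exact signature here are assumptions:

# Usage sketch (import paths and the create_auto_config signature are assumptions).
from ludwig.automl import create_auto_config, train_with_config

config = create_auto_config(dataset="sales.csv", target="revenue",
                            time_limit_s=3600)
results = train_with_config("sales.csv", config,
                            output_directory="./results",
                            random_seed=42)
# AutoTrainResults wraps the Ray Tune experiment analysis built by _train
analysis = results.experiment_analysis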
Example #3
def train_with_config(
    dataset: Union[str, pd.DataFrame, dd.core.DataFrame],
    config: dict,
    output_directory: str = OUTPUT_DIR,
    **kwargs,
) -> AutoTrainResults:
    """
    Performs hyperparameter optimization with respect to the given config
    and selects the best model.

    # Inputs
    :param dataset: (str, pd.DataFrame, dd.core.DataFrame) filepath to the
                    dataset or an in-memory DataFrame.
    :param config: (dict) Ludwig configuration to use for training, e.g. one
                   produced by `create_auto_config`.
    :param output_directory: (str) directory into which to write results, defaults to
        current working directory.

    # Returns
    :return: (AutoTrainResults) results containing hyperopt experiments and best model
    """
    _ray_init()
    model_name = config[COMBINER][TYPE]
    hyperopt_results = _train(config,
                              dataset,
                              output_directory=output_directory,
                              model_name=model_name,
                              **kwargs)
    # catch edge case where metric_score is nan
    # TODO (ASN): Decide how we want to proceed if at least one trial has
    # completed
    for trial in hyperopt_results.ordered_trials:
        if np.isnan(trial.metric_score):
            warnings.warn(
                "There was an error running the experiment. "
                "A trial failed to start. "
                "Consider increasing the time budget for the experiment.")

    experiment_analysis = hyperopt_results.experiment_analysis
    return AutoTrainResults(experiment_analysis)
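
The only functional difference from the previous variant is that the model name is read directly from the config's combiner section instead of via `get_model_type`. A sketch of that lookup, assuming the constant values match Ludwig's usual schema keys:

# Assumed constant values (consistent with the config schema used above).
COMBINER = "combiner"
TYPE = "type"

config = {
    "combiner": {"type": "tabnet"},  # hypothetical combiner section
    # ... input_features, output_features, hyperopt, etc.
}
model_name = config[COMBINER][TYPE]  # -> "tabnet"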
Example #4
def _create_default_config(
    dataset: Union[str, dd.core.DataFrame, pd.DataFrame, DatasetInfo],
    target_name: Union[str, List[str]] = None,
    time_limit_s: Union[int, float] = None,
) -> dict:
    """Returns auto_train configs for three available combiner models. Coordinates the following tasks:

    - extracts fields and generates list of FieldInfo objects
    - gets field metadata (i.e. avg. words, total non-null entries)
    - builds input_features and output_features section of config
    - for each combiner, adds default training and hyperopt sections
    - infers resource constraints and adds gpu and cpu resource allocation per
      trial

    # Inputs
    :param dataset: (str, pd.DataFrame, dd.core.DataFrame, DatasetInfo) path to
                    the dataset, an in-memory DataFrame, or precomputed dataset
                    metadata
    :param target_name: (str, List[str]) name of the target feature(s)
    :param time_limit_s: (int, float) total time in seconds allocated to
                         auto_train; acts as the stopping parameter

    # Return
    :return: (dict) a base auto_train config plus default configs for all
    available encoder and combiner types
    """
    _ray_init()
    resources = get_available_resources()
    experiment_resources = allocate_experiment_resources(resources)

    dataset_info = dataset
    if not isinstance(dataset, DatasetInfo):
        dataset_info = get_dataset_info(dataset)

    input_and_output_feature_config = get_features_config(
        dataset_info.fields, dataset_info.row_count, resources, target_name)

    model_configs = {}

    # read in base config and update with experiment resources
    base_automl_config = load_yaml(BASE_AUTOML_CONFIG)
    base_automl_config["hyperopt"]["executor"].update(experiment_resources)
    base_automl_config["hyperopt"]["executor"]["time_budget_s"] = time_limit_s
    if time_limit_s is not None:
        base_automl_config["hyperopt"]["sampler"]["scheduler"][
            "max_t"] = time_limit_s
    base_automl_config.update(input_and_output_feature_config)

    model_configs["base_config"] = base_automl_config

    # read in all encoder configs
    for feat_type, default_configs in encoder_defaults.items():
        if feat_type not in model_configs.keys():
            model_configs[feat_type] = {}
        for encoder_name, encoder_config_path in default_configs.items():
            model_configs[feat_type][encoder_name] = load_yaml(
                encoder_config_path)

    # read in all combiner configs
    model_configs["combiner"] = {}
    for combiner_type, default_config in combiner_defaults.items():
        combiner_config = load_yaml(default_config)
        model_configs["combiner"][combiner_type] = combiner_config

    return model_configs
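
A sketch of the returned structure and how a caller might assemble one runnable config from it; the exact nesting and the shallow merge shown are assumptions inferred from the loop bodies above:

# Hypothetical assembly of a single config from the pieces returned above.
import copy

model_configs = _create_default_config("sales.csv", target_name="revenue",
                                       time_limit_s=3600)
# model_configs is roughly:
# {
#     "base_config": {...},                   # features + hyperopt/executor settings
#     "<feat_type>": {"<encoder>": {...}},    # per-feature-type encoder defaults
#     "combiner": {"<combiner_type>": {...}}, # per-combiner defaults
# }
config = copy.deepcopy(model_configs["base_config"])
# overlay one combiner's defaults on the shared base ("tabnet" key is assumed)
config.update(copy.deepcopy(model_configs["combiner"]["tabnet"]))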