Example #1
    def __init__(self,
                 model_definition=None,
                 model_definition_file=None,
                 logging_level=logging.ERROR):
        # check for model_definition and model_definition_file
        if model_definition is None and model_definition_file is None:
            raise ValueError(
                'Either model_definition or model_definition_file has to be '
                'not None to initialize a LudwigModel')
        if model_definition is not None and model_definition_file is not None:
            raise ValueError('Only one between model_definition and '
                             'model_definition_file can be provided')

        self.set_logging_level(logging_level)

        if model_definition_file is not None:
            with open(model_definition_file, 'r') as def_file:
                self.model_definition = merge_with_defaults(
                    yaml.safe_load(def_file))
        else:
            model_definition_copy = copy.deepcopy(model_definition)
            self.model_definition = merge_with_defaults(model_definition_copy)

        self.train_set_metadata = None
        self.model = None
        self.exp_dir_name = ''
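
A rough usage sketch for the constructor above (the feature names and types here are illustrative, not taken from the original source); merge_with_defaults is what fills in any section the caller omits:

model_definition = {
    'input_features': [{'name': 'doc', 'type': 'text'}],
    'output_features': [{'name': 'label', 'type': 'category'}],
}
model = LudwigModel(model_definition=model_definition)
# model.model_definition should now also carry default sections
# (e.g. 'preprocessing' and 'combiner') filled in by merge_with_defaults.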
Example #2
def test_invalid_trainer_type(model_type):
    config = {
        INPUT_FEATURES: [category_feature()],
        OUTPUT_FEATURES: [category_feature()],
        MODEL_TYPE: model_type,
        "trainer": {
            "type": "invalid_trainer"
        },
    }

    with pytest.raises(ValidationError):
        merge_with_defaults(config)
Example #3
 def __init__(self,
              model_definition,
              model_definition_file=None,
              logging_level=logging.ERROR):
     logging.getLogger().setLevel(logging_level)
     if model_definition_file is not None:
         with open(model_definition_file, 'r') as def_file:
             self.model_definition = merge_with_defaults(
                 yaml.load(def_file))
     else:
         self.model_definition = merge_with_defaults(model_definition)
     self.train_set_metadata = None
     self.model = None
Example #4
 def __init__(self,
              model_definition,
              model_definition_file=None,
              logging_level=logging.ERROR):
     logging.getLogger('ludwig').setLevel(logging_level)
     if model_definition_file is not None:
         with open(model_definition_file, 'r') as def_file:
             self.model_definition = merge_with_defaults(
                 yaml.safe_load(def_file))
     else:
         model_definition_copy = copy.deepcopy(model_definition)
         self.model_definition = merge_with_defaults(model_definition_copy)
     self.train_set_metadata = None
     self.model = None
     self.exp_dir_name = None
Example #5
def test_global_default_parameters_merge_with_defaults(csv_filename):
    config, _ = _prepare_data(csv_filename)

    updated_config = merge_with_defaults(config)

    assert DEFAULTS in updated_config

    # Make sure no type specific parameters are in preprocessing
    input_feature_types = set(input_type_registry)
    for parameter in updated_config[PREPROCESSING]:
        assert parameter not in input_feature_types

    # All feature-specific preprocessing parameters should be in defaults
    defaults_with_preprocessing = [
        feature for feature in updated_config[DEFAULTS]
        if PREPROCESSING in updated_config[DEFAULTS][feature]
    ]
    assert len(defaults_with_preprocessing) == len(input_feature_types)

    # Feature encoders and decoders should update
    for feature in updated_config[INPUT_FEATURES]:
        assert feature[ENCODER] == updated_config[DEFAULTS][
            feature[TYPE]][ENCODER][TYPE]

    output_feature = updated_config[OUTPUT_FEATURES][0]
    assert output_feature[DECODER] == updated_config[DEFAULTS][
        output_feature[TYPE]][DECODER][TYPE]
Example #6
def _setup_ludwig_config(dataset_fp: str) -> Tuple[Dict, str]:
    input_features = [
        text_feature(name="utterance", reduce_output="sum"),
        category_feature(vocab_size=3),
    ]

    output_features = [category_feature(vocab_size=3)]

    rel_path = generate_data(input_features, output_features, dataset_fp)

    config = {
        INPUT_FEATURES: input_features,
        OUTPUT_FEATURES: output_features,
        COMBINER: {
            "type": "concat",
            "num_fc_layers": 2
        },
        TRAINER: {
            "epochs": 2,
            "learning_rate": 0.001
        },
    }

    config = merge_with_defaults(config)

    return config, rel_path
Example #7
def test_merge_with_defaults_early_stop(use_train, use_hyperopt_scheduler):
    all_input_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        text_feature(),
    ]
    all_output_features = [
        category_feature(),
        sequence_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        INPUT_FEATURES: all_input_features,
        OUTPUT_FEATURES: all_output_features,
        HYPEROPT: HYPEROPT_CONFIG,
    }
    config = copy.deepcopy(config)

    if use_train:
        config[TRAINER] = {"batch_size": 42}

    if use_hyperopt_scheduler:
        # hyperopt scheduler cannot be used with early stopping
        config[HYPEROPT][EXECUTOR][SCHEDULER] = SCHEDULER_DICT

    merged_config = merge_with_defaults(config)

    expected = -1 if use_hyperopt_scheduler else ECDTrainerConfig().early_stop
    assert merged_config[TRAINER]["early_stop"] == expected
Example #8
def test_validate_with_preprocessing_defaults():
    config = {
        "input_features": [
            audio_feature("/tmp/destination_folder", preprocessing=AudioFeatureMixin.preprocessing_defaults),
            bag_feature(preprocessing=BagFeatureMixin.preprocessing_defaults),
            binary_feature(preprocessing=BinaryFeatureMixin.preprocessing_defaults),
            category_feature(preprocessing=CategoryFeatureMixin.preprocessing_defaults),
            date_feature(preprocessing=DateFeatureMixin.preprocessing_defaults),
            h3_feature(preprocessing=H3FeatureMixin.preprocessing_defaults),
            image_feature("/tmp/destination_folder", preprocessing=ImageFeatureMixin.preprocessing_defaults),
            numerical_feature(preprocessing=NumericalFeatureMixin.preprocessing_defaults),
            sequence_feature(preprocessing=SequenceFeatureMixin.preprocessing_defaults),
            set_feature(preprocessing=SetFeatureMixin.preprocessing_defaults),
            text_feature(preprocessing=TextFeatureMixin.preprocessing_defaults),
            timeseries_feature(preprocessing=TimeseriesFeatureMixin.preprocessing_defaults),
            vector_feature(preprocessing=VectorFeatureMixin.preprocessing_defaults),
        ],
        "output_features": [{"name": "target", "type": "category"}],
        "training": {
            "decay": True,
            "learning_rate": 0.001,
            "validation_field": "target",
            "validation_metric": "accuracy",
        },
    }

    validate_config(config)
    config = merge_with_defaults(config)
    validate_config(config)
Example #9
def test_merge_with_defaults_early_stop(use_train, use_hyperopt_scheduler):
    all_input_features = [
        binary_feature(),
        category_feature(),
        numerical_feature(),
        text_feature(),
    ]
    all_output_features = [
        category_feature(),
        sequence_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
        HYPEROPT: HYPEROPT_CONFIG,
    }
    config = copy.deepcopy(config)

    if use_train:
        config[TRAINING] = {"batch_size": 42}

    if use_hyperopt_scheduler:
        # hyperopt scheduler cannot be used with early stopping
        config[HYPEROPT]["sampler"]["scheduler"] = SCHEDULER

    merged_config = merge_with_defaults(config)

    expected = -1 if use_hyperopt_scheduler else default_early_stop
    assert merged_config[TRAINING]["early_stop"] == expected
Example #10
def test_missing_outputs_drop_rows():
    config = {
        INPUT_FEATURES: [category_feature()],
        OUTPUT_FEATURES: [category_feature()],
        DEFAULTS: {
            CATEGORY: {
                PREPROCESSING: {
                    MISSING_VALUE_STRATEGY: FILL_WITH_MODE
                }
            }
        },
    }

    merged_config = merge_with_defaults(config)

    global_preprocessing = merged_config[DEFAULTS]
    input_feature_config = merged_config[INPUT_FEATURES][0]
    output_feature_config = merged_config[OUTPUT_FEATURES][0]

    assert output_feature_config[PREPROCESSING][
        MISSING_VALUE_STRATEGY] == DROP_ROW

    feature_preprocessing = merge_dict(
        global_preprocessing[output_feature_config[TYPE]][PREPROCESSING],
        output_feature_config[PREPROCESSING])
    assert feature_preprocessing[MISSING_VALUE_STRATEGY] == DROP_ROW

    feature_preprocessing = global_preprocessing[
        input_feature_config[TYPE]][PREPROCESSING]
    assert feature_preprocessing[MISSING_VALUE_STRATEGY] == FILL_WITH_MODE
Example #11
def test_deprecated_field_aliases():
    config = {
        "input_features": [{"name": "num_in", "type": "numerical"}],
        "output_features": [{"name": "num_out", "type": "numerical"}],
        "training": {
            "epochs": 2,
        },
        "hyperopt": {
            "parameters": {
                "training.learning_rate": {
                    "space": "loguniform",
                    "lower": 0.001,
                    "upper": 0.1,
                },
            },
            "goal": "minimize",
        },
    }

    merged_config = merge_with_defaults(config)

    assert merged_config["input_features"][0][TYPE] == NUMBER
    assert merged_config["output_features"][0][TYPE] == NUMBER

    assert "training" not in merged_config
    assert merged_config[TRAINER]["epochs"] == 2

    hparams = merged_config[HYPEROPT]["parameters"]
    assert "training.learning_rate" not in hparams
    assert "trainer.learning_rate" in hparams
Example #12
def get_preprocessing_params(model_definition):
    model_definition = merge_with_defaults(model_definition)

    global_preprocessing_parameters = model_definition['preprocessing']
    features = (
            model_definition['input_features'] +
            model_definition['output_features']
    )

    global_preprocessing_parameters = merge_dict(
        default_preprocessing_parameters,
        global_preprocessing_parameters
    )

    merged_preprocessing_params = []
    for feature in features:
        if 'preprocessing' in feature:
            local_preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature['type']],
                feature['preprocessing']
            )
        else:
            local_preprocessing_parameters = global_preprocessing_parameters[
                feature['type']
            ]
        merged_preprocessing_params.append(
            (feature['name'], feature['type'], local_preprocessing_parameters)
        )

    return merged_preprocessing_params
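
A hedged sketch of how the helper above might be called; the config below is made up for illustration, and the per-feature 'preprocessing' block is merged on top of the type-level defaults exactly as the loop above does:

model_definition = {
    'input_features': [{'name': 'doc', 'type': 'text',
                        'preprocessing': {'lowercase': True}}],
    'output_features': [{'name': 'label', 'type': 'category'}],
}
for name, feature_type, params in get_preprocessing_params(model_definition):
    print(name, feature_type, params)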
Example #13
File: api.py  Project: wbeater/ludwig
def test_train_online(data_csv,
                      model_definition,
                      batch_size=128,
                      debug=False,
                      logging_level=logging.ERROR,
                      **kwargs):
    model_definition = merge_with_defaults(model_definition)
    data, train_set_metadata = build_dataset(
        data_csv, (model_definition['input_features'] +
                   model_definition['output_features']),
        model_definition['preprocessing'])

    ludwig_model = LudwigModel(model_definition, logging_level=logging_level)
    ludwig_model.initialize_model(train_set_metadata=train_set_metadata)

    ludwig_model.train_online(
        data_csv=data_csv,
        batch_size=128,
    )
    ludwig_model.train_online(
        data_csv=data_csv,
        batch_size=128,
    )

    # predict
    predictions = ludwig_model.predict(
        data_csv=data_csv,
        batch_size=batch_size,
    )
    ludwig_model.close()
    logger.critical(predictions)
Example #14
def run_hyperopt_executor(
    sampler,
    executor,
    csv_filename,
    ray_mock_dir,
    validate_output_feature=False,
    validation_metric=None,
):
    with ray_start_4_cpus():
        config = _get_config(sampler, executor)

        csv_filename = os.path.join(ray_mock_dir, "dataset.csv")
        dataset_csv = generate_data(config["input_features"], config["output_features"], csv_filename, num_examples=100)
        dataset_parquet = create_data_set_to_use("parquet", dataset_csv)

        config = merge_with_defaults(config)

        hyperopt_config = config["hyperopt"]

        if validate_output_feature:
            hyperopt_config["output_feature"] = config["output_features"][0]["name"]
        if validation_metric:
            hyperopt_config["validation_metric"] = validation_metric

        update_hyperopt_params_with_defaults(hyperopt_config)

        parameters = hyperopt_config["parameters"]
        if sampler.get("search_alg", {}).get("type", "") == "bohb":
            # bohb does not support grid_search search space
            del parameters["combiner.num_steps"]

        split = hyperopt_config["split"]
        output_feature = hyperopt_config["output_feature"]
        metric = hyperopt_config["metric"]
        goal = hyperopt_config["goal"]

        hyperopt_sampler = get_build_hyperopt_sampler(sampler["type"])(goal, parameters, **sampler)

        # preprocess
        backend = RayBackend(**RAY_BACKEND_KWARGS)
        model = LudwigModel(config=config, backend=backend)
        training_set, validation_set, test_set, training_set_metadata = model.preprocess(
            dataset=dataset_parquet,
        )

        # hyperopt
        hyperopt_executor = MockRayTuneExecutor(hyperopt_sampler, output_feature, metric, split, **executor)
        hyperopt_executor.mock_path = os.path.join(ray_mock_dir, "bucket")

        hyperopt_executor.execute(
            config,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            backend=backend,
            output_directory=ray_mock_dir,
            skip_save_processed_input=True,
            skip_save_unprocessed_output=True,
        )
Example #15
def test_hyperopt_executor(sampler,
                           executor,
                           csv_filename,
                           validate_output_feature=False,
                           validation_metric=None):
    if executor['type'] == 'fiber' and sampler['type'] == 'grid':
        # This test is very slow and doesn't give us additional coverage
        pytest.skip('Skipping Fiber grid search')

    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")
    ]

    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        "training": {
            "epochs": 2,
            "learning_rate": 0.001
        }
    }

    config = merge_with_defaults(config)

    hyperopt_config = HYPEROPT_CONFIG.copy()

    if validate_output_feature:
        hyperopt_config['output_feature'] = output_features[0]['name']
    if validation_metric:
        hyperopt_config['validation_metric'] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    hyperopt_sampler = get_build_hyperopt_sampler(sampler["type"])(goal,
                                                                   parameters,
                                                                   **sampler)

    hyperopt_executor = get_build_hyperopt_executor(executor["type"])(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_executor.execute(config,
                              dataset=rel_path,
                              gpus=get_available_gpus_cuda_string())
Example #16
def test_default_model_type():
    config = {
        INPUT_FEATURES: [category_feature()],
        OUTPUT_FEATURES: [category_feature()],
    }

    merged_config = merge_with_defaults(config)

    assert merged_config[MODEL_TYPE] == MODEL_ECD
Example #17
def memory_tune_config(config, dataset):
    fits_in_memory = False
    raw_config = merge_with_defaults(config)
    training_set_metadata = get_trainingset_metadata(raw_config, dataset)
    modified_hyperparam_search_space = copy.deepcopy(
        raw_config[HYPEROPT]["parameters"])
    params_to_modify = RANKED_MODIFIABLE_PARAM_LIST[get_model_name(raw_config)]
    param_list = list(params_to_modify.keys())
    current_param_values = {}
    max_memory = get_machine_memory()

    while param_list:  # stop once every tunable parameter has been exhausted
        # compute memory utilization
        current_param_values = get_new_params(
            current_param_values, modified_hyperparam_search_space,
            params_to_modify)
        temp_config = sub_new_params(raw_config, current_param_values)
        if compute_memory_usage(temp_config,
                                training_set_metadata) < max_memory:
            fits_in_memory = True
            break
        # check if we have exhausted tuning of current param (e.g. we can no longer reduce the param value)
        param, min_value = param_list[0], params_to_modify[param_list[0]]

        if param in modified_hyperparam_search_space:
            param_space = modified_hyperparam_search_space[param]["space"]
            if param_space == "choice":
                categories = modified_hyperparam_search_space[param]["categories"]
                if len(categories) > 2 and categories[-2] > min_value:
                    # drop the last (largest) candidate value
                    modified_hyperparam_search_space[param]["categories"] = categories[:-1]
                else:
                    param_list.pop(0)  # exhausted reduction of this parameter
            else:
                # reduce the upper bound by 10% of the search range
                upper_bound = modified_hyperparam_search_space[param]["upper"]
                lower_bound = modified_hyperparam_search_space[param]["lower"]
                new_upper_bound = upper_bound - (upper_bound - lower_bound) * 0.1
                if new_upper_bound > lower_bound and new_upper_bound > min_value:
                    modified_hyperparam_search_space[param]["upper"] = new_upper_bound
                else:
                    param_list.pop(0)  # exhausted reduction of this parameter
        else:
            param_list.pop(0)  # param not in hyperopt search space

    modified_config = copy.deepcopy(config)

    modified_config[HYPEROPT]["parameters"] = modified_hyperparam_search_space
    return modified_config, fits_in_memory
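
A hedged sketch of calling the tuner above; auto_config and dataset are placeholders, and the two return values mirror the (modified_config, fits_in_memory) tuple returned by the function:

tuned_config, fits_in_memory = memory_tune_config(auto_config, dataset)
if not fits_in_memory:
    # even the most reduced search space was estimated to exceed machine memory
    print("warning: tuned hyperopt search space may still not fit in memory")
model = LudwigModel(config=tuned_config)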
Example #18
def test_deprecated_field_aliases():
    config = {
        INPUT_FEATURES: [{
            "name": "num_in",
            "type": "numerical"
        }],
        OUTPUT_FEATURES: [{
            "name": "num_out",
            "type": "numerical"
        }],
        "training": {
            "epochs": 2,
            "eval_batch_size": 0,
        },
        HYPEROPT: {
            "parameters": {
                "training.learning_rate": {
                    "space": "loguniform",
                    "lower": 0.001,
                    "upper": 0.1,
                },
            },
            "goal": "minimize",
            "sampler": {
                "type": "grid",
                "num_samples": 2,
                "scheduler": {
                    "type": "fifo"
                }
            },
            "executor": {
                "type": "grid",
                "search_alg": "bohb",
            },
        },
    }

    merged_config = merge_with_defaults(config)

    assert merged_config["input_features"][0][TYPE] == NUMBER
    assert merged_config["output_features"][0][TYPE] == NUMBER

    assert "training" not in merged_config
    assert merged_config[TRAINER]["epochs"] == 2
    assert merged_config[TRAINER][EVAL_BATCH_SIZE] is None

    hparams = merged_config[HYPEROPT]["parameters"]
    assert "training.learning_rate" not in hparams
    assert "trainer.learning_rate" in hparams

    assert "sampler" not in merged_config[HYPEROPT]

    assert merged_config[HYPEROPT]["executor"]["type"] == "ray"
    assert "num_samples" in merged_config[HYPEROPT]["executor"]
    assert "scheduler" in merged_config[HYPEROPT]["executor"]
Example #19
File: api.py  Project: prmrreddy/ludwig
    def __init__(self,
                 model_definition,
                 logging_level=logging.ERROR,
                 use_horovod=None,
                 gpus=None,
                 gpu_memory_limit=None,
                 allow_parallel_threads=True,
                 random_seed=default_random_seed):
        """
        :param model_definition: (dict, string) in-memory representation of the model
               definition, or string path to a saved YAML/JSON model definition file.
        :param logging_level: Log level that will be sent to stderr.
        :param use_horovod: (bool) use Horovod for distributed training. Will be set
               automatically if `horovodrun` is used to launch the training script.
        :param gpus: (string, default: `None`) list of GPUs to use (it uses the
               same syntax of CUDA_VISIBLE_DEVICES)
        :param gpu_memory_limit: (int: default: `None`) maximum memory in MB to allocate
              per GPU device.
        :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow to use
               multithreading parallelism to improve performance at the cost of
               determinism.
        """
        # check if model definition is a path or a dict
        if isinstance(model_definition, str):  # assume path
            with open(model_definition, 'r') as def_file:
                model_definition_dict = yaml.safe_load(def_file)
            self.model_definition_fp = model_definition
        else:
            model_definition_dict = copy.deepcopy(model_definition)
            self.model_definition_fp = None

        # merge model definition with defaults
        self.model_definition = merge_with_defaults(model_definition_dict)

        # setup horovod
        self._horovod = configure_horovod(use_horovod)

        # setup logging
        self.set_logging_level(logging_level)

        # setup TensorFlow
        initialize_tensorflow(gpus, gpu_memory_limit, allow_parallel_threads,
                              self._horovod)
        # todo refactoring: decide where to put this,
        #  here or at the beginning of training.
        #  Either way make sure it is called before the model is initialized.
        # tf.random.set_seed(random_seed)

        # setup model
        self.model = None
        self.training_set_metadata = None

        # online training state
        self._online_trainer = None
Example #20
def test_hyperopt_executor(sampler,
                           executor,
                           csv_filename,
                           validate_output_feature=False,
                           validation_metric=None):
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")
    ]

    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    model_definition = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        "training": {
            "epochs": 2,
            "learning_rate": 0.001
        }
    }

    model_definition = merge_with_defaults(model_definition)

    hyperopt_config = HYPEROPT_CONFIG.copy()

    if validate_output_feature:
        hyperopt_config['output_feature'] = output_features[0]['name']
    if validation_metric:
        hyperopt_config['validation_metric'] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    hyperopt_sampler = get_build_hyperopt_sampler(sampler["type"])(goal,
                                                                   parameters,
                                                                   **sampler)

    hyperopt_executor = get_build_hyperopt_executor(executor["type"])(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_executor.execute(model_definition,
                              dataset=rel_path,
                              gpus=get_available_gpus_cuda_string())
Example #21
def test_default_trainer_type(model_trainer_type):
    model_type, expected_trainer_type = model_trainer_type
    config = {
        INPUT_FEATURES: [category_feature()],
        OUTPUT_FEATURES: [category_feature()],
        MODEL_TYPE: model_type,
    }

    merged_config = merge_with_defaults(config)

    assert merged_config[TRAINER][TYPE] == expected_trainer_type
Example #22
def test_hyperopt_executor(sampler,
                           executor,
                           csv_filename,
                           validate_output_feature=False,
                           validation_metric=None):
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum"),
    ]

    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        TRAINER: {
            "epochs": 2,
            "learning_rate": 0.001
        },
    }

    config = merge_with_defaults(config)

    hyperopt_config = HYPEROPT_CONFIG.copy()

    if validate_output_feature:
        hyperopt_config["output_feature"] = output_features[0]["name"]
    if validation_metric:
        hyperopt_config["validation_metric"] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    hyperopt_sampler = get_build_hyperopt_sampler(sampler["type"])(goal,
                                                                   parameters,
                                                                   **sampler)

    hyperopt_executor = get_build_hyperopt_executor(executor["type"])(
        hyperopt_sampler, output_feature, metric, split, **executor)

    gpus = [i for i in range(torch.cuda.device_count())]
    hyperopt_executor.execute(config, dataset=rel_path, gpus=gpus)
Example #23
def test_config_features():
    all_input_features = [
        audio_feature("/tmp/destination_folder"),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature("/tmp/destination_folder"),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
    }
    validate_config(config)

    # make sure all defaults provided also registers as valid
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features
        if feature["type"] not in output_type_registry.keys()
    ]
    for input_feature in input_only_features:
        config = {
            "input_features": all_input_features,
            "output_features": all_output_features + [input_feature],
        }

        dtype = input_feature["type"]
        with pytest.raises(ValidationError,
                           match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)
Example #24
File: test_schema.py  Project: cxz/ludwig
def test_config_features():
    all_input_features = [
        audio_feature('/tmp/destination_folder'),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature('/tmp/destination_folder'),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        'input_features': all_input_features,
        'output_features': all_output_features,
    }
    validate_config(config)

    # make sure all defaults provided also registers as valid
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features
        if feature['type'] not in OUTPUT_FEATURE_TYPES
    ]
    for input_feature in input_only_features:
        config = {
            'input_features': all_input_features,
            'output_features': all_output_features + [input_feature],
        }

        dtype = input_feature['type']
        with pytest.raises(ValidationError,
                           match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)
Example #25
def run_hyperopt_executor(sampler, executor, csv_filename,
                          validate_output_feature=False,
                          validation_metric=None):
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")]

    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        "training": {"epochs": 2, "learning_rate": 0.001},
        "hyperopt": {
            **HYPEROPT_CONFIG,
            "executor": executor,
            "sampler": sampler,
        },
    }

    config = merge_with_defaults(config)

    hyperopt_config = config["hyperopt"]

    if validate_output_feature:
        hyperopt_config['output_feature'] = output_features[0]['name']
    if validation_metric:
        hyperopt_config['validation_metric'] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    if sampler.get("search_alg", {}).get("type", "") == 'bohb':
        # bohb does not support grid_search search space
        del parameters['utterance.cell_type']

    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    hyperopt_sampler = get_build_hyperopt_sampler(
        sampler["type"])(goal, parameters, **sampler)

    hyperopt_executor = get_build_hyperopt_executor(executor["type"])(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_executor.execute(config, dataset=rel_path)
Example #26
def run_hyperopt_executor(
    search_alg,
    executor,
    csv_filename,
    tmpdir,
    validate_output_feature=False,
    validation_metric=None,
    use_split=True,
):
    config = _get_config(search_alg, executor)
    rel_path = generate_data(config["input_features"], config["output_features"], csv_filename)

    if not use_split:
        df = pd.read_csv(rel_path)
        df["split"] = 0
        df.to_csv(rel_path)

    config = merge_with_defaults(config)

    hyperopt_config = config["hyperopt"]

    if validate_output_feature:
        hyperopt_config["output_feature"] = config["output_features"][0]["name"]
    if validation_metric:
        hyperopt_config["validation_metric"] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    if search_alg.get("type", "") == "bohb":
        # bohb does not support grid_search search space
        del parameters["utterance.cell_type"]
        hyperopt_config["parameters"] = parameters

    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]
    search_alg = hyperopt_config["search_alg"]

    hyperopt_executor = get_build_hyperopt_executor(executor["type"])(
        parameters, output_feature, metric, goal, split, search_alg=search_alg, **executor
    )

    hyperopt_executor.execute(
        config,
        dataset=rel_path,
        output_directory=tmpdir,
        backend="local",
    )
Example #27
def test_overwrite_trainer_type():
    expected_trainer_type = "ray_legacy_trainer"
    config = {
        INPUT_FEATURES: [category_feature()],
        OUTPUT_FEATURES: [category_feature()],
        MODEL_TYPE: MODEL_ECD,
        "trainer": {
            "type": expected_trainer_type
        },
    }

    merged_config = merge_with_defaults(config)

    assert merged_config[TRAINER][TYPE] == expected_trainer_type
Example #28
def run_hyperopt_executor(
    sampler, executor, csv_filename, ray_mock_dir,
    validate_output_feature=False,
    validation_metric=None,
):
    config = _get_config(sampler, executor)

    csv_filename = os.path.join(ray_mock_dir, 'dataset.csv')
    dataset_csv = generate_data(
        config['input_features'], config['output_features'], csv_filename, num_examples=100)
    dataset_parquet = create_data_set_to_use('parquet', dataset_csv)

    config = merge_with_defaults(config)

    hyperopt_config = config["hyperopt"]

    if validate_output_feature:
        hyperopt_config['output_feature'] = config['output_features'][0]['name']
    if validation_metric:
        hyperopt_config['validation_metric'] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    if sampler.get("search_alg", {}).get("type", "") == 'bohb':
        # bohb does not support grid_search search space
        del parameters['combiner.num_steps']

    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    hyperopt_sampler = get_build_hyperopt_sampler(
        sampler["type"])(goal, parameters, **sampler)

    hyperopt_executor = MockRayTuneExecutor(
        hyperopt_sampler, output_feature, metric, split, **executor)
    hyperopt_executor.mock_path = os.path.join(ray_mock_dir, "bucket")

    hyperopt_executor.execute(
        config,
        dataset=dataset_parquet,
        backend=RayBackend(processor={'parallelism': 4,}),
        output_directory=ray_mock_dir,
        skip_save_processed_input=True,
        skip_save_unprocessed_output=True
    )
Example #29
def test_hyperopt_executor(csv_filename):
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")
    ]

    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    model_definition = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        "training": {
            "epochs": 2,
            "learning_rate": 0.001
        }
    }

    model_definition = merge_with_defaults(model_definition)

    input_features = model_definition["input_features"]
    output_features = model_definition["output_features"]
    hyperopt_config = HYPEROPT_CONFIG

    update_hyperopt_params_with_defaults(hyperopt_config)

    strategy = hyperopt_config["strategy"]
    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    hyperopt_strategy = get_build_hyperopt_strategy(strategy["type"])(
        goal, parameters, **strategy)

    for executor in EXECUTORS:
        hyperopt_executor = get_build_hyperopt_executor(executor["type"])(
            hyperopt_strategy, output_feature, metric, split, **executor)

        hyperopt_executor.execute(model_definition, data_csv=rel_path)
Example #30
def run_hyperopt_executor(
    sampler,
    executor,
    csv_filename,
    validate_output_feature=False,
    validation_metric=None,
):
    config = _get_config(sampler, executor)
    rel_path = generate_data(config["input_features"],
                             config["output_features"], csv_filename)

    config = merge_with_defaults(config)

    hyperopt_config = config["hyperopt"]

    if validate_output_feature:
        hyperopt_config["output_feature"] = config["output_features"][0][
            "name"]
    if validation_metric:
        hyperopt_config["validation_metric"] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    if sampler.get("search_alg", {}).get("type", "") == "bohb":
        # bohb does not support grid_search search space
        del parameters["utterance.cell_type"]

    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    hyperopt_sampler = get_build_hyperopt_sampler(sampler["type"])(goal,
                                                                   parameters,
                                                                   **sampler)

    hyperopt_executor = get_build_hyperopt_executor(executor["type"])(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_executor.execute(
        config,
        dataset=rel_path,
        backend="local",
    )